Metric

Base class for metrics.

Subclasses must implement the evaluate method to perform metric computation. Use utility functions from flexeval.core.metric.utils for common patterns like string processing and category-wise aggregation.
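
For orientation, here is a minimal sketch of a custom metric, assuming `Metric` and `MetricResult` can be imported from the top-level `flexeval` package in the same way as the classes documented below; the metric itself is hypothetical:

from flexeval import Metric, MetricResult


class OutputLength(Metric):
    """A hypothetical metric that reports the character length of each model output."""

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        # Per-instance values go to instance_details; aggregated values go to summary.
        lengths = [len(output) for output in lm_outputs]
        return MetricResult(
            summary={"average_output_length": sum(lengths) / len(lengths)},
            instance_details=[{"output_length": length} for length in lengths],
        )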

Source code in flexeval/core/metric/base.py
class Metric(ABC):
    """
    Base class for metrics.

    Subclasses must implement the `evaluate` method to perform metric computation.
    Use utility functions from `flexeval.core.metric.utils` for common patterns
    like string processing and category-wise aggregation.
    """

    @abstractmethod
    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        """
        Evaluate the outputs of `LanguageModel` against the references.

        Args:
            lm_outputs: List of model outputs.
            references_list: List of reference outputs.
            extra_info_list: List of task inputs and some extra information.
        """

evaluate abstractmethod

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult

Evaluate the outputs of LanguageModel against the references.

Parameters:

  • lm_outputs (list[str]) –

    List of model outputs.

  • references_list (list[list[str]]) –

    List of reference outputs.

  • extra_info_list (list[dict[str, str]] | None, default: None ) –

    List of task inputs and some extra information.

Source code in flexeval/core/metric/base.py
@abstractmethod
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    """
    Evaluate the outputs of `LanguageModel` against the references.

    Args:
        lm_outputs: List of model outputs.
        references_list: List of reference outputs.
        extra_info_list: List of task inputs and some extra information.
    """

MetricResult dataclass

A dataclass representing the result of a metric evaluation.

Source code in flexeval/core/metric/base.py
@dataclass
class MetricResult:
    """
    A dataclass representing the result of a metric evaluation.
    """

    summary: dict[str, Any]
    """
    Summary containing aggregated metric values.
    """
    instance_details: list[dict[str, Any]] | None = None
    """
    A list of evaluation details for each instance.
    Useful for error analysis.
    """

summary instance-attribute

summary: dict[str, Any]

Summary containing aggregated metric values.

instance_details class-attribute instance-attribute

instance_details: list[dict[str, Any]] | None = None

A list of evaluation details for each instance. Useful for error analysis.

__init__

__init__(
    summary: dict[str, Any],
    instance_details: list[dict[str, Any]] | None = None,
) -> None
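
A short usage sketch: since `MetricResult` is a plain dataclass, custom metrics construct and return it directly (the field values below are illustrative, and the top-level import is assumed to work as in the other examples):

from flexeval import MetricResult

result = MetricResult(
    summary={"accuracy": 0.5},
    instance_details=[{"accuracy": 1.0}, {"accuracy": 0.0}],
)
print(result.summary["accuracy"])  # 0.5
print(result.instance_details[1])  # {'accuracy': 0.0}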

BLEU

An implementation of BLEU. The calculation is based on the sacrebleu library.

Parameters:

  • tokenize_option (str | None, default: None ) –

    Tokenization option for sacrebleu. If None, sacrebleu will use the default tokenization. For details, see sacreBLEU https://github.com/mjpost/sacrebleu/blob/aa3cc4351af6/sacrebleu/sacrebleu.py#L121-L124

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import BLEU
>>> bleu = BLEU()
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
>>> result = bleu.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={
        'bleu_score': 100.0,
        'bleu_bp': 1.0,
        'bleu_signature': nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1
    },
    instance_details=[
        {'bleu_score': 100.0, 'bleu_bp': 1.0},
        {'bleu_score': 100.0, 'bleu_bp': 1.0}
    ]
)
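
A hedged sketch of category-wise aggregation with category_key: each entry of extra_info_list is assumed to carry its category under that key (the key name "domain" is made up for illustration):

from flexeval import BLEU

bleu = BLEU(category_key="domain")
lm_outputs = ["I am a student .", "I am a teacher ."]
references_list = [["I am a student ."], ["I am a teacher ."]]
extra_info_list = [{"domain": "school"}, {"domain": "work"}]
result = bleu.evaluate(lm_outputs, references_list, extra_info_list)
# Besides the corpus-level entries, the summary gains keys such as
# "sentence_bleu_score/school" and "sentence_bleu_score/work", each holding
# the mean sentence-level BLEU of the instances in that category.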
Source code in flexeval/core/metric/bleu.py
class BLEU(Metric):
    """An implementation of [BLEU](https://aclanthology.org/P02-1040/).
    The calculation is based on the [sacrebleu](https://github.com/mjpost/sacrebleu) library.

    Args:
        tokenize_option: Tokenization option for sacrebleu.
            If `None`, sacrebleu will use the default tokenization.
            For details, see sacreBLEU
            https://github.com/mjpost/sacrebleu/blob/aa3cc4351af6/sacrebleu/sacrebleu.py#L121-L124
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import BLEU
        >>> bleu = BLEU()
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
        >>> result = bleu.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={
                'bleu_score': 100.0,
                'bleu_bp': 1.0,
                'bleu_signature': nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1
            },
            instance_details=[
                {'bleu_score': 100.0, 'bleu_bp': 1.0},
                {'bleu_score': 100.0, 'bleu_bp': 1.0}
            ]
        )
    """

    def __init__(
        self,
        tokenize_option: str | None = None,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        self._corpus_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option)
        # For sentence BLEU, we need to set `effective_order=True` as recommended by sacrebleu.
        self._sentence_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option, effective_order=True)

        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Restructure references for sacrebleu format
        max_num_refs = max(len(refs) for refs in references_list)
        references_for_sacrebleu: list[list[str]] = []
        for i in range(max_num_refs):
            set_of_references: list[str] = []
            for refs_for_source in references_list:
                if i < len(refs_for_source):
                    set_of_references.append(refs_for_source[i])
                else:
                    set_of_references.append("")
            references_for_sacrebleu.append(set_of_references)

        # Compute metrics
        bleu = self._corpus_bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu)
        sentence_bleu_list = [
            self._sentence_bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list)
        ]

        summary = {
            "bleu_score": bleu.score,
            "bleu_bp": bleu.bp,
            "bleu_signature": self._corpus_bleu.get_signature(),
        }

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            sentence_bleu_score_list = [b.score for b in sentence_bleu_list]
            category_wise_scores = aggregate_category_wise_scores(sentence_bleu_score_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"sentence_bleu_score/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"bleu_score": b.score, "bleu_bp": b.bp} for b in sentence_bleu_list],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    tokenize_option: str | None = None,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/bleu.py
def __init__(
    self,
    tokenize_option: str | None = None,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    self._corpus_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option)
    # For sentence BLEU, we need to set `effective_order=True` as recommended by sacrebleu.
    self._sentence_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option, effective_order=True)

    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/bleu.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Restructure references for sacrebleu format
    max_num_refs = max(len(refs) for refs in references_list)
    references_for_sacrebleu: list[list[str]] = []
    for i in range(max_num_refs):
        set_of_references: list[str] = []
        for refs_for_source in references_list:
            if i < len(refs_for_source):
                set_of_references.append(refs_for_source[i])
            else:
                set_of_references.append("")
        references_for_sacrebleu.append(set_of_references)

    # Compute metrics
    bleu = self._corpus_bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu)
    sentence_bleu_list = [
        self._sentence_bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list)
    ]

    summary = {
        "bleu_score": bleu.score,
        "bleu_bp": bleu.bp,
        "bleu_signature": self._corpus_bleu.get_signature(),
    }

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        sentence_bleu_score_list = [b.score for b in sentence_bleu_list]
        category_wise_scores = aggregate_category_wise_scores(sentence_bleu_score_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"sentence_bleu_score/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"bleu_score": b.score, "bleu_bp": b.bp} for b in sentence_bleu_list],
    )

CharF1

A metric that calculates how many characters in the output string are included in the characters of the expected output. If there are multiple expected outputs, the highest score is adopted.

Parameters:

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import CharF1
>>> char_f1 = CharF1()
>>> lm_outputs = ["abcd", "efgh"]
>>> references_list = [["abcd", "ABCD"], ["efGH"]]
>>> result = char_f1.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(summary={'char_f1': 0.75}, instance_details=[{'char_f1': 1.0}, {'char_f1': 0.5}])
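
For intuition, the per-instance scores in the example above can be reproduced directly with the similarity ratio; this sketch assumes the `fuzz` module used internally comes from the `rapidfuzz` package (the import is not shown in this excerpt):

from rapidfuzz import fuzz

# "efgh" vs. "efGH": only "ef" matches case-sensitively, so the normalized
# similarity is 2 * 2 / (4 + 4) = 0.5.
print(fuzz.ratio("abcd", "abcd") / 100)  # 1.0
print(fuzz.ratio("efgh", "efGH") / 100)  # 0.5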
Source code in flexeval/core/metric/char_f1.py
class CharF1(Metric):
    """
    A metric that calculates how many characters in the output string are included
    in the characters of the expected output.
    If there are multiple expected outputs, the highest score is adopted.

    Args:
        lm_output_processor: StringProcessor or list of StringProcessor to apply to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import CharF1
        >>> char_f1 = CharF1()
        >>> lm_outputs = ["abcd", "efgh"]
        >>> references_list = [["abcd", "ABCD"], ["efGH"]]
        >>> result = char_f1.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(summary={'char_f1': 0.75}, instance_details=[{'char_f1': 1.0}, {'char_f1': 0.5}])
    """

    def __init__(
        self,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Compute metrics
        char_f1_scores: list[float] = []
        for lm_output, expected_output in zip(lm_outputs, references_list):
            score = max(fuzz.ratio(lm_output, o) for o in expected_output) / 100
            char_f1_scores.append(score)

        summary = {"char_f1": sum(char_f1_scores) / len(char_f1_scores)}

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            category_wise_scores = aggregate_category_wise_scores(char_f1_scores, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"char_f1/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"char_f1": s} for s in char_f1_scores],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/char_f1.py
def __init__(
    self,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/char_f1.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Compute metrics
    char_f1_scores: list[float] = []
    for lm_output, expected_output in zip(lm_outputs, references_list):
        score = max(fuzz.ratio(lm_output, o) for o in expected_output) / 100
        char_f1_scores.append(score)

    summary = {"char_f1": sum(char_f1_scores) / len(char_f1_scores)}

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        category_wise_scores = aggregate_category_wise_scores(char_f1_scores, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"char_f1/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"char_f1": s} for s in char_f1_scores],
    )

CodeEval

A metric that evaluates generated code with test cases.

Parameters:

  • code_template (str | None, default: None ) –

    A Jinja2 template string used to construct the code to be evaluated. The template can reference variables from extra_info. If None, the generated text itself is used as the code.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    String processors applied to model outputs before evaluation.

  • evaluate_module (str, default: 'code_eval' ) –

    An evaluate module to use.

Examples:

>>> from flexeval import CodeEval
>>> code_eval = CodeEval()
>>> lm_outputs = ["def add(a, b):\n    return a + b", "def is_equal(a, b):\n    return a = b"]
>>> references_list = [["assert add(1, 2) == 3"], ["assert is_equal(1, 2) == False"]]
>>> result = code_eval.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'pass@1': 0.5},
    instance_details=[
        {'passed': True, 'result': 'passed'},
        {'passed': False, 'result': 'failed: invalid syntax (<string>, line 2)'}
    ]
)
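
A hedged sketch of code_template: the template is rendered with lm_output plus any keys from extra_info, and the assembled code is executed against the test cases in references_list. The "prompt" key below is only an assumption about what extra_info might contain, and the underlying code_eval module from the evaluate library must be enabled via the HF_ALLOW_CODE_EVAL environment variable:

import os

from flexeval import CodeEval

os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # required by the evaluate `code_eval` module

# Prepend the function signature from extra_info to the model's completion.
code_eval = CodeEval(code_template="{{ prompt }}{{ lm_output }}")
lm_outputs = ["    return a + b"]
references_list = [["assert add(1, 2) == 3"]]
extra_info_list = [{"prompt": "def add(a, b):\n"}]
result = code_eval.evaluate(lm_outputs, references_list, extra_info_list)
# Expected: the summary reports pass@1 == 1.0, since the assembled code passes its test case.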
Source code in flexeval/core/metric/code_eval.py
class CodeEval(Metric):
    """
    A metric that evaluates generated code with test cases.

    Args:
        code_template: A Jinja2 template string used to construct the code to be evaluated.
            The template can reference variables from extra_info.
            If `None`, the generated text itself is used as the code.
        lm_output_processor: String processors applied to model outputs before evaluation.
        evaluate_module: An evaluate module to use.

    Examples:
        >>> from flexeval import CodeEval
        >>> code_eval = CodeEval()
        >>> lm_outputs = ["def add(a, b):\\n    return a + b", "def is_equal(a, b):\\n    return a = b"]
        >>> references_list = [["assert add(1, 2) == 3"], ["assert is_equal(1, 2) == False"]]
        >>> result = code_eval.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'pass@1': 0.5},
            instance_details=[
                {'passed': True, 'result': 'passed'},
                {'passed': False, 'result': 'failed: invalid syntax (<string>, line 2)'}
            ]
        )
    """

    def __init__(
        self,
        code_template: str | None = None,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        evaluate_module: str = "code_eval",
    ) -> None:
        if code_template is None:
            code_template = "{{ lm_output }}"

        self.code_template = JINJA2_ENV.from_string(code_template)
        self.code_eval = evaluate.load(evaluate_module)

        self.lm_output_processors = lm_output_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

        # Compute metrics
        generated_code_list: list[str] = []
        test_case_list: list[str] = []
        # in code generation tasks, references_list contains the test cases
        for lm_output, extra_info, test_cases in zip(
            lm_outputs,
            extra_info_list,
            references_list,
        ):
            generated_code = self.code_template.render(lm_output=lm_output, **extra_info)
            generated_code_list.append(generated_code)
            test_case_list.append("\n".join(test_cases))
        pass_at_k, results = self.code_eval.compute(
            references=test_case_list,
            predictions=[[c] for c in generated_code_list],
            k=[1],
        )

        # `results` contain the detailed results for each test case
        # e.g., {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed", 'completion_id': 0})]}
        results: dict[int, list[tuple[int, dict[str, Any]]]]

        instance_details: list[dict[str, Any]] = []
        for i in range(len(lm_outputs)):
            first_result = results[i][0]  # we assume only one candidate code per instance, so we take the first result
            _, detail_result = first_result  # the first element is just the index so we ignore it
            # remove unnecessary fields to save space
            detail_result.pop("completion_id")
            detail_result.pop("task_id")
            instance_details.append(detail_result)

        return MetricResult(pass_at_k, instance_details=instance_details)

code_template instance-attribute

code_template = from_string(code_template)

code_eval instance-attribute

code_eval = load(evaluate_module)

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

__init__

__init__(
    code_template: str | None = None,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    evaluate_module: str = "code_eval",
) -> None
Source code in flexeval/core/metric/code_eval.py
def __init__(
    self,
    code_template: str | None = None,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    evaluate_module: str = "code_eval",
) -> None:
    if code_template is None:
        code_template = "{{ lm_output }}"

    self.code_template = JINJA2_ENV.from_string(code_template)
    self.code_eval = evaluate.load(evaluate_module)

    self.lm_output_processors = lm_output_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/code_eval.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

    # Compute metrics
    generated_code_list: list[str] = []
    test_case_list: list[str] = []
    # in code generation tasks, references_list contains the test cases
    for lm_output, extra_info, test_cases in zip(
        lm_outputs,
        extra_info_list,
        references_list,
    ):
        generated_code = self.code_template.render(lm_output=lm_output, **extra_info)
        generated_code_list.append(generated_code)
        test_case_list.append("\n".join(test_cases))
    pass_at_k, results = self.code_eval.compute(
        references=test_case_list,
        predictions=[[c] for c in generated_code_list],
        k=[1],
    )

    # `results` contain the detailed results for each test case
    # e.g., {0: [(0, {'task_id': 0, 'passed': False, 'result': "failed", 'completion_id': 0})]}
    results: dict[int, list[tuple[int, dict[str, Any]]]]

    instance_details: list[dict[str, Any]] = []
    for i in range(len(lm_outputs)):
        first_result = results[i][0]  # we assume only one candidate code per instance, so we take the first result
        _, detail_result = first_result  # the first element is just the index so we ignore it
        # remove unnecessary fields to save space
        detail_result.pop("completion_id")
        detail_result.pop("task_id")
        instance_details.append(detail_result)

    return MetricResult(pass_at_k, instance_details=instance_details)

CommonPrefixLength

A metric that calculates the length of the longest common prefix between the model output and the reference.

Examples:

>>> from flexeval import CommonPrefixLength
>>> common_prefix_length = CommonPrefixLength()
>>> lm_outputs = ["ABCDEFG"]
>>> references_list = [["ABCdefg"]]
>>> result = common_prefix_length.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"average_common_prefix_length": 3.0, "longest_common_prefix_length": 3},
    instance_details=[{"common_prefix_length": 3}],
)
Source code in flexeval/core/metric/common_prefix_length.py
class CommonPrefixLength(Metric):
    """
    A metric that calculates the length of the longest common prefix between the model output and the reference.

    Examples:
        >>> from flexeval import CommonPrefixLength
        >>> common_prefix_length = CommonPrefixLength()
        >>> lm_outputs = ["ABCDEFG"]
        >>> references_list = [["ABCdefg"]]
        >>> result = common_prefix_length.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"average_common_prefix_length": 3.0, "longest_common_prefix_length": 3},
            instance_details=[{"common_prefix_length": 3}],
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        common_prefix_length_list: list[int] = []
        for lm_output, references in zip(lm_outputs, references_list):
            common_prefix_length = max(len(get_longest_common_prefix(lm_output, gt)) for gt in references)
            common_prefix_length_list.append(common_prefix_length)

        return MetricResult(
            {
                "average_common_prefix_length": sum(common_prefix_length_list) / len(common_prefix_length_list),
                "longest_common_prefix_length": max(common_prefix_length_list),
            },
            instance_details=[{"common_prefix_length": s} for s in common_prefix_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/common_prefix_length.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    common_prefix_length_list: list[int] = []
    for lm_output, references in zip(lm_outputs, references_list):
        common_prefix_length = max(len(get_longest_common_prefix(lm_output, gt)) for gt in references)
        common_prefix_length_list.append(common_prefix_length)

    return MetricResult(
        {
            "average_common_prefix_length": sum(common_prefix_length_list) / len(common_prefix_length_list),
            "longest_common_prefix_length": max(common_prefix_length_list),
        },
        instance_details=[{"common_prefix_length": s} for s in common_prefix_length_list],
    )

CommonStringLength

A metric that calculates the length of the longest common substring between the model output and the reference.

Examples:

>>> from flexeval import CommonStringLength
>>> common_string_length = CommonStringLength()
>>> lm_outputs = ["aBCDEFG"]
>>> references_list = [["ABCDefg"]]
>>> result = common_string_length.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"average_common_string_length": 3.0, "longest_common_string_length": 3},
    instance_details=[{"common_string_length": 3}],
)
Source code in flexeval/core/metric/common_string_length.py
class CommonStringLength(Metric):
    """
    A metric that calculates the length of the longest common substring between the model output and the reference.

    Examples:
        >>> from flexeval import CommonStringLength
        >>> common_string_length = CommonStringLength()
        >>> lm_outputs = ["aBCDEFG"]
        >>> references_list = [["ABCDefg"]]
        >>> result = common_string_length.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"average_common_string_length": 3.0, "longest_common_string_length": 3},
            instance_details=[{"common_string_length": 3}],
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        common_string_length_list: list[int] = []
        for lm_output, references in zip(lm_outputs, references_list):
            common_string_length = max(len(get_longest_common_substring(lm_output, gt)) for gt in references)
            common_string_length_list.append(common_string_length)

        return MetricResult(
            {
                "average_common_string_length": sum(common_string_length_list) / len(common_string_length_list),
                "longest_common_string_length": max(common_string_length_list),
            },
            instance_details=[{"common_string_length": s} for s in common_string_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/common_string_length.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    common_string_length_list: list[int] = []
    for lm_output, references in zip(lm_outputs, references_list):
        common_string_length = max(len(get_longest_common_substring(lm_output, gt)) for gt in references)
        common_string_length_list.append(common_string_length)

    return MetricResult(
        {
            "average_common_string_length": sum(common_string_length_list) / len(common_string_length_list),
            "longest_common_string_length": max(common_string_length_list),
        },
        instance_details=[{"common_string_length": s} for s in common_string_length_list],
    )

Correlation

Correlation metric to compute Pearson, Spearman, or Kendall correlation coefficients. The lm_outputs and references should be numeric values, optionally preprocessed by StringProcessor.

Parameters:

  • method (Literal['pearson', 'spearman', 'kendall'], default: 'pearson' ) –

    The correlation method to use ('pearson', 'spearman', 'kendall').

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before computing the correlation. If a list is provided, the processors will be applied in order.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the references before computing the correlation. If a list is provided, the processors will be applied in order.

Examples:

>>> from flexeval import Correlation
>>> correlation = Correlation(method='pearson')
>>> lm_outputs = ["1", "2", "3", "4", "5"]
>>> references = [["5"], ["4"], ["3"], ["2"], ["1"]]
>>> result = correlation.evaluate(lm_outputs, references)
>>> print(result)
MetricResult(
    summary={"pearson_correlation": -1.0, "pearson_pvalue": 0.0},
    instance_details=[],
)
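
A small sketch of a fallback visible in the source listing below: model outputs that cannot be parsed as floats are scored as 0.0 (with a warning), which can noticeably distort the coefficient:

from flexeval import Correlation

correlation = Correlation(method="spearman")
lm_outputs = ["1", "2", "not a number", "4", "5"]
references_list = [["1"], ["2"], ["3"], ["4"], ["5"]]
result = correlation.evaluate(lm_outputs, references_list)
# "not a number" is replaced by 0.0 before Spearman's rho is computed,
# so the reported correlation drops below the otherwise perfect 1.0.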
Source code in flexeval/core/metric/correlation.py
class Correlation(Metric):
    """
    Correlation metric to compute Pearson, Spearman, or Kendall correlation coefficients.
    The lm_outputs and references should be numeric values, optionally preprocessed by StringProcessor.

    Args:
        method: The correlation method to use ('pearson', 'spearman', 'kendall').
        lm_output_processor: StringProcessor or a list of StringProcessor to be applied to the model outputs before
            computing the correlation. If a list is provided, the processors will be applied in order.
        reference_processor: StringProcessor or a list of StringProcessor to be applied to the references before
            computing the correlation. If a list is provided, the processors will be applied in order.

    Examples:
        >>> from flexeval import Correlation
        >>> correlation = Correlation(method='pearson')
        >>> lm_outputs = ["1", "2", "3", "4", "5"]
        >>> references = [["5"], ["4"], ["3"], ["2"], ["1"]]
        >>> result = correlation.evaluate(lm_outputs, references)
        >>> print(result)
        MetricResult(
            summary={"pearson_correlation": -1.0, "pearson_pvalue": 0.0},
            instance_details=[],
        )
    """

    def __init__(
        self,
        method: Literal["pearson", "spearman", "kendall"] = "pearson",
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
    ) -> None:
        if method not in {"pearson", "spearman", "kendall"}:
            msg = f"Invalid method '{method}'. Choose from 'pearson', 'spearman', 'kendall'."
            raise ValueError(msg)
        self.method = method

        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data - we only use the first reference here
        references = [refs[0] for refs in references_list]
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references = [apply_string_processors(ref, self.reference_processors) for ref in references]

        # Convert to numeric values
        lm_outputs_as_float: list[float] = []
        for output in lm_outputs:
            try:
                lm_outputs_as_float.append(float(output))
            except ValueError:  # noqa:PERF203
                warnings.warn(f"Failed to convert model output '{output}' to float. Treating it as 0.", stacklevel=2)
                lm_outputs_as_float.append(0.0)

        references_as_float = [float(ref) for ref in references]

        # Compute metrics
        if self.method == "pearson":
            correlation, pvalue = pearsonr(lm_outputs_as_float, references_as_float)
        elif self.method == "spearman":
            correlation, pvalue = spearmanr(lm_outputs_as_float, references_as_float)
        elif self.method == "kendall":
            correlation, pvalue = kendalltau(lm_outputs_as_float, references_as_float)
        else:
            msg = f"Unsupported method: {self.method}"
            raise ValueError(msg)

        return MetricResult(
            {f"{self.method}_correlation": correlation, f"{self.method}_pvalue": pvalue},
            instance_details=[],
        )

method instance-attribute

method = method

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

__init__

__init__(
    method: Literal[
        "pearson", "spearman", "kendall"
    ] = "pearson",
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
) -> None
Source code in flexeval/core/metric/correlation.py
def __init__(
    self,
    method: Literal["pearson", "spearman", "kendall"] = "pearson",
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
) -> None:
    if method not in {"pearson", "spearman", "kendall"}:
        msg = f"Invalid method '{method}'. Choose from 'pearson', 'spearman', 'kendall'."
        raise ValueError(msg)
    self.method = method

    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/correlation.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data - we only use the first reference here
    references = [refs[0] for refs in references_list]
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references = [apply_string_processors(ref, self.reference_processors) for ref in references]

    # Convert to numeric values
    lm_outputs_as_float: list[float] = []
    for output in lm_outputs:
        try:
            lm_outputs_as_float.append(float(output))
        except ValueError:  # noqa:PERF203
            warnings.warn(f"Failed to convert model output '{output}' to float. Treating it as 0.", stacklevel=2)
            lm_outputs_as_float.append(0.0)

    references_as_float = [float(ref) for ref in references]

    # Compute metrics
    if self.method == "pearson":
        correlation, pvalue = pearsonr(lm_outputs_as_float, references_as_float)
    elif self.method == "spearman":
        correlation, pvalue = spearmanr(lm_outputs_as_float, references_as_float)
    elif self.method == "kendall":
        correlation, pvalue = kendalltau(lm_outputs_as_float, references_as_float)
    else:
        msg = f"Unsupported method: {self.method}"
        raise ValueError(msg)

    return MetricResult(
        {f"{self.method}_correlation": correlation, f"{self.method}_pvalue": pvalue},
        instance_details=[],
    )

ExactMatch

Exact match metric. If there are multiple references, the output is considered correct if it matches any of the references.

Parameters:

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import ExactMatch
>>> exact_match = ExactMatch()
>>> lm_outputs = ["ABC", "DEF"]
>>> references_list = [["ABC"], ["DEFG"]]
>>> result = exact_match.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={"exact_match": 0.5},
    instance_details=[{"exact_match": True}, {"exact_match": False}],
)
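
A short sketch of the multiple-reference behavior described above: an output counts as correct if it matches any reference for that instance:

from flexeval import ExactMatch

exact_match = ExactMatch()
lm_outputs = ["DEF"]
references_list = [["ABC", "DEF", "GHI"]]  # any single match counts as correct
result = exact_match.evaluate(lm_outputs, references_list)
# summary == {"exact_match": 1.0}, instance_details == [{"exact_match": True}]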
Source code in flexeval/core/metric/exact_match.py
class ExactMatch(Metric):
    """
    Exact match metric.
    If there are multiple references, the output is considered correct if it matches any of the references.

    Args:
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import ExactMatch
        >>> exact_match = ExactMatch()
        >>> lm_outputs = ["ABC", "DEF"]
        >>> references_list = [["ABC"], ["DEFG"]]
        >>> result = exact_match.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={"exact_match": 0.5},
            instance_details=[{"exact_match": True}, {"exact_match": False}],
        )
    """

    def __init__(
        self,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Compute metrics
        exact_match_list = [
            lm_output in expected_output for lm_output, expected_output in zip(lm_outputs, references_list)
        ]
        summary = {"exact_match": sum(exact_match_list) / len(exact_match_list)}

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            category_wise_scores = aggregate_category_wise_scores(exact_match_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"exact_match/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"exact_match": s} for s in exact_match_list],
        )

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

category_key instance-attribute

category_key = category_key

__init__

__init__(
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/exact_match.py
def __init__(
    self,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    reference_processor: StringProcessor | list[StringProcessor] | None = None,
    category_key: str | None = None,
) -> None:
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/exact_match.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Compute metrics
    exact_match_list = [
        lm_output in expected_output for lm_output, expected_output in zip(lm_outputs, references_list)
    ]
    summary = {"exact_match": sum(exact_match_list) / len(exact_match_list)}

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        category_wise_scores = aggregate_category_wise_scores(exact_match_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"exact_match/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"exact_match": s} for s in exact_match_list],
    )

ChatLLMGEvalScore

A metric that evaluates the output of LanguageModel.batch_generate_chat_response. Unlike ChatLLMScore, this metric lets the model output logprobs for all valid scores and calculates a weighted score over them. Note that due to a constraint of the OpenAI API, the number of valid scores must not exceed 20.

Parameters:

  • language_model (required) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (required) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • valid_score_range (required) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

  • prob_threshold (float, default: 0 ) –

    A guard against low-probability predictions: the instance score is None (invalid) if the total probability assigned to all valid scores is less than this value.

Examples:

>>> from flexeval import ChatLLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
>>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nOutput only a number from 1 to 5."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> llm_score = ChatLLMGEvalScore(language_model, prompt_template, [1, 5], system_message=system_message)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> llm_score.evaluate(lm_outputs)
MetricResult(
    summary={'llm_geval_score': 1.179980414173022, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_geval_score': 1.1509989197179789,
            'llm_geval_score_input': [
                {'role': 'system', 'content': 'This is the system message.'},
                {'role': 'user', 'content': 'Evaluate the quality of this text...'}
            ],
            'llm_geval_score_logprobs': {
                '1': -0.06977498531341553,
                '2': -3.687819004058838,
                '3': -3.937819480895996,
                '4': -5.812800884246826,
                '5': -3.937807083129883
            },
            'llm_geval_score_generation_probs': {
                1: 0.932603645815178,
                2: 0.02502652531327666,
                3: 0.01949066821765914,
                4: 0.002989046364034347,
                5: 0.019490909859903
            }
        },
        {
            'llm_geval_score': 1.208961908628065,
            'llm_geval_score_input': [
                {'role': 'system', 'content': 'This is the system message.'},
                {'role': 'user', 'content': 'Evaluate the quality of this text...'}
            ],
            'llm_geval_score_logprobs': {
                '1': -0.13043057918548584,
                '2': -2.8754935264587402,
                '3': -3.000467538833618,
                '4': -4.750283241271973,
                '5': -5.000345706939697
            },
            'llm_geval_score_generation_probs': {
                1: 0.8777174226922144,
                2: 0.05638830351569556,
                3: 0.04976379642068341,
                4: 0.008649245032977617,
                5: 0.006735618046639277
            }
        }
    ])
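
A minimal sketch of the weighted-average scoring described above, using the logprobs from the first instance of the example; the normalization by the total probability mass over valid labels is inferred from the reported scores and from the role of prob_threshold:

import math

# Illustrative logprobs over the valid labels "1".."5" (taken from the example above).
logprobs = {
    "1": -0.06977498531341553,
    "2": -3.687819004058838,
    "3": -3.937819480895996,
    "4": -5.812800884246826,
    "5": -3.937807083129883,
}
probs = {int(label): math.exp(lp) for label, lp in logprobs.items()}
total = sum(probs.values())  # this mass is compared against prob_threshold
score = sum(s * p for s, p in probs.items()) / total
print(round(score, 4))  # ~1.151, matching 'llm_geval_score' above
# If `total` fell below `prob_threshold`, the instance score would be None instead.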
Source code in flexeval/core/metric/llm_geval_score.py
class ChatLLMGEvalScore(Metric):
    """A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.
    Unlike ChatLLMScore, this metric lets the model output logprobs for all valid scores and
    calculates a weighted score over them.
    Note that due to a constraint of the OpenAI API, the number of valid scores must not exceed 20.

    Args:
        language_model (required): An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template (required): An instance of `PromptTemplate` to embed the input for the evaluator.
        valid_score_range (required): A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        batch_size: The batch size for the evaluator.
        system_message: A system message to be prepended to the input for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.
        prob_threshold: A guard against low-probability predictions: the instance score is None (invalid)
            if the total probability assigned to all valid scores is less than this value.


    Examples:
        >>> from flexeval import ChatLLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
        >>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nOutput only a number from 1 to 5."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> llm_score = ChatLLMGEvalScore(language_model, prompt_template, [1, 5], system_message=system_message)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> llm_score.evaluate(lm_outputs)
        MetricResult(
            summary={'llm_geval_score': 1.179980414173022, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_geval_score': 1.1509989197179789,
                    'llm_geval_score_input': [
                        {'role': 'system', 'content': 'This is the system message.'},
                        {'role': 'user', 'content': 'Evaluate the quality of this text...'}
                    ],
                    'llm_geval_score_logprobs': {
                        '1': -0.06977498531341553,
                        '2': -3.687819004058838,
                        '3': -3.937819480895996,
                        '4': -5.812800884246826,
                        '5': -3.937807083129883
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.932603645815178,
                        2: 0.02502652531327666,
                        3: 0.01949066821765914,
                        4: 0.002989046364034347,
                        5: 0.019490909859903
                    }
                },
                {
                    'llm_geval_score': 1.208961908628065,
                    'llm_geval_score_input': [
                        {'role': 'system', 'content': 'This is the system message.'},
                        {'role': 'user', 'content': 'Evaluate the quality of this text...'}
                    ],
                    'llm_geval_score_logprobs': {
                        '1': -0.13043057918548584,
                        '2': -2.8754935264587402,
                        '3': -3.000467538833618,
                        '4': -4.750283241271973,
                        '5': -5.000345706939697
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.8777174226922144,
                        2: 0.05638830351569556,
                        3: 0.04976379642068341,
                        4: 0.008649245032977617,
                        5: 0.006735618046639277
                    }
                }
            ])
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        valid_score_range: tuple[int, int],
        batch_size: int = 4,
        system_message: str | PromptTemplate | None = None,
        disable_tqdm: bool = False,
        category_key: str | None = None,
        prob_threshold: float = 0,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.system_message = system_message
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key
        self.prob_threshold = prob_threshold

        self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        # Compute metrics
        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
        )
        evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
            evaluator_input_list,
            self.language_model,
            self.valid_labels,
            self.batch_size,
            self.disable_tqdm,
            "Calculating logprobs",
        )

        evaluator_score_list: list[int | None] = []
        evaluator_probs_list: list[dict[int, float]] = []
        for evaluator_logprobs in evaluator_logprobs_list:
            evaluator_score, evaluator_probs = calculate_weighted_average(
                evaluator_logprobs,
                self.valid_score_range,
                self.prob_threshold,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
            evaluator_score_list.append(evaluator_score)
            evaluator_probs_list.append(evaluator_probs)

        summary = summarize_evaluator_geval_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_geval_score": eval_score,
                    "llm_geval_score_input": eval_in,
                    "llm_geval_score_logprobs": eval_logprobs,
                    "llm_geval_score_generation_probs": eval_probs,
                }
                for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_logprobs_list,
                    evaluator_probs_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

system_message instance-attribute

system_message = system_message

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

prob_threshold instance-attribute

prob_threshold = prob_threshold

valid_labels instance-attribute

valid_labels = [
    str(score)
    for score in range(
        valid_score_range[0], valid_score_range[1] + 1
    )
]

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    system_message: str | PromptTemplate | None = None,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None
Source code in flexeval/core/metric/llm_geval_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    system_message: str | PromptTemplate | None = None,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.system_message = system_message
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key
    self.prob_threshold = prob_threshold

    self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_geval_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    # Compute metrics
    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
    )
    evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
        evaluator_input_list,
        self.language_model,
        self.valid_labels,
        self.batch_size,
        self.disable_tqdm,
        "Calculating logprobs",
    )

    evaluator_score_list: list[int | None] = []
    evaluator_probs_list: list[dict[int, float]] = []
    for evaluator_logprobs in evaluator_logprobs_list:
        evaluator_score, evaluator_probs = calculate_weighted_average(
            evaluator_logprobs,
            self.valid_score_range,
            self.prob_threshold,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
        evaluator_score_list.append(evaluator_score)
        evaluator_probs_list.append(evaluator_probs)

    summary = summarize_evaluator_geval_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_geval_score": eval_score,
                "llm_geval_score_input": eval_in,
                "llm_geval_score_logprobs": eval_logprobs,
                "llm_geval_score_generation_probs": eval_probs,
            }
            for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_logprobs_list,
                evaluator_probs_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_geval_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMGEvalScore

Let LanguageModel evaluate the output of another LanguageModel. Unlike LLMScore, this metric lets the model output logprobs for all valid scores and calculates a probability-weighted score over them. Note that, due to a constraint of OpenAI models, the number of valid scores must not exceed 20. For details, see https://aclanthology.org/2023.emnlp-main.153/

You can specify the evaluation criteria in PromptTemplate.

Parameters:

  • language_model (required) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (required) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • valid_score_range (required) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

  • prob_threshold (float, default: 0 ) –

    To guard against cases where all valid scores have low probability: return None (invalid) if the total probability mass over the valid scores is less than this value (see the sketch below).
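
The summary value is a probability-weighted average: the logprob of each valid score is exponentiated into a probability, the weighted mean over the valid range is taken, and prob_threshold rejects instances whose total probability mass is too small. The following is a minimal sketch of that computation (illustrative only, not the library's internal calculate_weighted_average; weighted_score is a hypothetical helper), reusing the logprob values from the example below.

import math

# Illustrative sketch, not the library's internal implementation.
def weighted_score(logprobs, valid_score_range, prob_threshold=0.0):
    low, high = valid_score_range
    # Turn the logprob of each valid score into a probability.
    probs = {s: math.exp(logprobs[str(s)]) for s in range(low, high + 1) if str(s) in logprobs}
    total = sum(probs.values())
    if total <= 0 or total < prob_threshold:
        return None, probs  # treated as a failed score parse
    # Probability-weighted average over the valid scores.
    return sum(s * p for s, p in probs.items()) / total, probs

logprobs = {'1': -4.0625, '2': -7.75, '3': -8.25, '4': -8.0625, '5': -6.4375}
score, probs = weighted_score(logprobs, (1, 5))
print(round(score, 6))  # 1.418921, matching the first instance in the example below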

Examples:

>>> from flexeval import LLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
>>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nOutput only a number from 1 to 5."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> llm_score = LLMGEvalScore(language_model, prompt_template, [1, 5])
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> llm_score.evaluate(lm_outputs)
MetricResult(
    summary={'llm_geval_score': 1.4399980931290486, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_geval_score': 1.418920817254956,
            'llm_geval_score_input': 'Evaluate the quality of this text...',
            'llm_geval_score_logprobs': {
                '1': -4.0625,
                '2': -7.75,
                '3': -8.25,
                '4': -8.0625,
                '5': -6.4375
            },
            'llm_geval_score_generation_probs': {
                1: 0.017205950425851383,
                2: 0.00043074254057568753,
                3: 0.00026125855730166754,
                4: 0.000315137974737356,
                5: 0.0016004026902445643
            }
        },
        {
            'llm_geval_score': 1.461075369003141,
            'llm_geval_score_input': 'Evaluate the quality of this text...',
            'llm_geval_score_logprobs': {
                '1': -4.25,
                '2': -8.1875,
                '3': -8.375,
                '4': -8.125,
                '5': -6.5
            },
            'llm_geval_score_generation_probs': {
                1: 0.014264233908999256,
                2: 0.00027810828659249914,
                3: 0.00023055986759244163,
                4: 0.0002960447300568554,
                5: 0.0015034391929775724
            }
        }
    ]
)
Source code in flexeval/core/metric/llm_geval_score.py
class LLMGEvalScore(Metric):
    """Let LanguageModel evaluate the output of another LanguageModel.
    Unlike LLMScore, this metric lets the model output logprobs for all valid scores and
    calculates a probability-weighted score over them.
    Note that, due to a constraint of OpenAI models, the number of valid scores must not exceed 20.
    For details, see https://aclanthology.org/2023.emnlp-main.153/

    You can specify the evaluation criteria in `PromptTemplate`.

    Args:
        language_model (required): An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template (required): An instance of `PromptTemplate` to embed the input for the evaluator.
        valid_score_range (required): A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.
        prob_threshold: To guard against cases where all valid scores have low probability,
            return None (invalid) if the total probability mass over the valid scores is less than this value.

    Examples:
        >>> from flexeval import LLMGEvalScore, HuggingFaceLM, Jinja2PromptTemplate
        >>> language_model = HuggingFaceLM("Qwen/Qwen2.5-0.5B-Instruct")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nOutput only a number from 1 to 5."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> llm_score = LLMGEvalScore(language_model, prompt_template, [1, 5])
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> llm_score.evaluate(lm_outputs)
        MetricResult(
            summary={'llm_geval_score': 1.4399980931290486, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_geval_score': 1.418920817254956,
                    'llm_geval_score_input': 'Evaluate the quality of this text...',
                    'llm_geval_score_logprobs': {
                        '1': -4.0625,
                        '2': -7.75,
                        '3': -8.25,
                        '4': -8.0625,
                        '5': -6.4375
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.017205950425851383,
                        2: 0.00043074254057568753,
                        3: 0.00026125855730166754,
                        4: 0.000315137974737356,
                        5: 0.0016004026902445643
                    }
                },
                {
                    'llm_geval_score': 1.461075369003141,
                    'llm_geval_score_input': 'Evaluate the quality of this text...',
                    'llm_geval_score_logprobs': {
                        '1': -4.25,
                        '2': -8.1875,
                        '3': -8.375,
                        '4': -8.125,
                        '5': -6.5
                    },
                    'llm_geval_score_generation_probs': {
                        1: 0.014264233908999256,
                        2: 0.00027810828659249914,
                        3: 0.00023055986759244163,
                        4: 0.0002960447300568554,
                        5: 0.0015034391929775724
                    }
                }
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        valid_score_range: tuple[int, int],
        batch_size: int = 4,
        disable_tqdm: bool = False,
        category_key: str | None = None,
        prob_threshold: float = 0,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key
        self.prob_threshold = prob_threshold

        self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template
        )
        evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
            evaluator_input_list,
            self.language_model,
            self.valid_labels,
            self.batch_size,
            self.disable_tqdm,
            "Calculating logprobs",
        )

        evaluator_score_list: list[int | None] = []
        evaluator_probs_list: list[dict[int, float]] = []
        for evaluator_logprobs in evaluator_logprobs_list:
            evaluator_score, evaluator_probs = calculate_weighted_average(
                evaluator_logprobs,
                self.valid_score_range,
                self.prob_threshold,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
            evaluator_score_list.append(evaluator_score)
            evaluator_probs_list.append(evaluator_probs)

        summary = summarize_evaluator_geval_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_geval_score": eval_score,
                    "llm_geval_score_input": eval_in,
                    "llm_geval_score_logprobs": eval_logprobs,
                    "llm_geval_score_generation_probs": eval_probs,
                }
                for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_logprobs_list,
                    evaluator_probs_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

prob_threshold instance-attribute

prob_threshold = prob_threshold

valid_labels instance-attribute

valid_labels = [
    str(score)
    for score in range(
        valid_score_range[0], valid_score_range[1] + 1
    )
]

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None
Source code in flexeval/core/metric/llm_geval_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    valid_score_range: tuple[int, int],
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
    prob_threshold: float = 0,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key
    self.prob_threshold = prob_threshold

    self.valid_labels = [str(score) for score in range(valid_score_range[0], valid_score_range[1] + 1)]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_geval_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template
    )
    evaluator_logprobs_list: list[dict[str, float]] = generate_evaluation_logprobs(
        evaluator_input_list,
        self.language_model,
        self.valid_labels,
        self.batch_size,
        self.disable_tqdm,
        "Calculating logprobs",
    )

    evaluator_score_list: list[int | None] = []
    evaluator_probs_list: list[dict[int, float]] = []
    for evaluator_logprobs in evaluator_logprobs_list:
        evaluator_score, evaluator_probs = calculate_weighted_average(
            evaluator_logprobs,
            self.valid_score_range,
            self.prob_threshold,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator logprobs: {evaluator_logprobs}")
        evaluator_score_list.append(evaluator_score)
        evaluator_probs_list.append(evaluator_probs)

    summary = summarize_evaluator_geval_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_geval_score": eval_score,
                "llm_geval_score_input": eval_in,
                "llm_geval_score_logprobs": eval_logprobs,
                "llm_geval_score_generation_probs": eval_probs,
            }
            for eval_score, eval_in, eval_logprobs, eval_probs in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_logprobs_list,
                evaluator_probs_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_geval_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

ChatLLMLabel

A metric that evaluates the output of LanguageModel.batch_generate_chat_response.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • label_names (list[str]) –

    A list of valid label names.

  • label_points (list[float | int] | None, default: None ) –

    A list of points for each label specified in label_names.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.
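
Because this metric targets chat models, the evaluator receives its input as a list of chat messages rather than plain text: the rendered prompt becomes the user turn, preceded by system_message when one is given. Below is a rough illustration of that input shape (assumed behaviour, not the library's prepare_chat_input_for_evaluator).

from jinja2 import Template

# Hypothetical illustration of the chat-format evaluator input.
template = Template(
    "Evaluate the quality of this text on a scale of Good/Bad.\n"
    "`{{ lm_output }}`\nPut the label at the end like [[Good]]."
)
system_message = "This is the system message."
lm_output = "Hello, world!"

messages = []
if system_message is not None:
    messages.append({"role": "system", "content": system_message})
messages.append({"role": "user", "content": template.render(lm_output=lm_output)})
print(messages)  # [{'role': 'system', ...}, {'role': 'user', ...}]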

Examples:

>>> from flexeval import ChatLLMLabel, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text on a scale of Good/Bad.\n`{{ lm_output }}`\nPut the label at the end like [[Good]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> label_names = ["Good", "Bad"]
>>> label_points = [1.0, 0.0]
>>> llm_label = ChatLLMLabel(language_model, prompt_template, label_names, label_points)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_label.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_label': 'Good',
            'llm_score': 1.0,
            'llm_label_input': 'Evaluate the quality of this text...',
            'llm_label_output': 'This text is natural, ... [[Good]]'
        },
        {
            'llm_label': 'Bad',
            'llm_score': 0.0,
            'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\n`Good mrrrning!`\nPut the label at the end like [[Good]].',
            'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
        }
    ]
)
Source code in flexeval/core/metric/llm_label.py
class ChatLLMLabel(Metric):
    """
    A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        label_names: A list of valid label names.
        label_points: A list of points for each label specified in label_names.
        system_message: A system message to be prepended to the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import ChatLLMLabel, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text on a scale of Good/Bad.\\n`{{ lm_output }}`\\nPut the label at the end like [[Good]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> label_names = ["Good", "Bad"]
        >>> label_points = [1.0, 0.0]
        >>> llm_label = ChatLLMLabel(language_model, prompt_template, label_names, label_points)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_label.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_label': 'Good',
                    'llm_score': 1.0,
                    'llm_label_input': 'Evaluate the quality of this text...',
                    'llm_label_output': 'This text is natural, ... [[Good]]'
                },
                {
                    'llm_label': 'Bad',
                    'llm_score': 0.0,
                    'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\\n`Good mrrrning!`\\nPut the label at the end like [[Good]].',
                    'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
                }
            ]
        )
    """  # noqa: E501

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        label_names: list[str],
        label_points: list[float | int] | None = None,
        system_message: str | PromptTemplate | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.label_names = [re.escape(label) for label in label_names]

        if label_points:
            if len(self.label_names) != len(label_points):
                msg = "The lengths of label_names and weights do not match."
                raise ValueError(msg)
            label_points: list[float] = list(map(float, label_points))
        else:
            label_points = [0.0] * len(label_names)
            label_points[0] = 1.0

        self.weights = label_points
        self.system_message = system_message
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
        )

        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
        )

        evaluator_label_list: list[str] = []
        for evaluator_output in evaluator_output_list:
            evaluator_label = parse_label_from_evaluator_output(
                evaluator_output.text,
                label_names=self.label_names,
            )
            if evaluator_label is None:
                logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
            evaluator_label_list.append(evaluator_label)

        label2point = dict(zip(self.label_names, self.weights))
        evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

        summary = summarize_evaluator_labels(
            evaluator_label_list,
            extra_info_list,
            self.label_names,
            self.weights,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_label": eval_label,
                    "llm_score": eval_score,
                    "llm_label_input": eval_in,
                    "llm_label_output": eval_out.text,
                }
                for eval_label, eval_score, eval_in, eval_out in zip(
                    evaluator_label_list,
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

label_names instance-attribute

label_names = [escape(label) for label in label_names]

weights instance-attribute

weights = label_points

system_message instance-attribute

system_message = system_message

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_label.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.label_names = [re.escape(label) for label in label_names]

    if label_points:
        if len(self.label_names) != len(label_points):
            msg = "The lengths of label_names and weights do not match."
            raise ValueError(msg)
        label_points: list[float] = list(map(float, label_points))
    else:
        label_points = [0.0] * len(label_names)
        label_points[0] = 1.0

    self.weights = label_points
    self.system_message = system_message
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_label.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
    )

    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
    )

    evaluator_label_list: list[str] = []
    for evaluator_output in evaluator_output_list:
        evaluator_label = parse_label_from_evaluator_output(
            evaluator_output.text,
            label_names=self.label_names,
        )
        if evaluator_label is None:
            logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
        evaluator_label_list.append(evaluator_label)

    label2point = dict(zip(self.label_names, self.weights))
    evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

    summary = summarize_evaluator_labels(
        evaluator_label_list,
        extra_info_list,
        self.label_names,
        self.weights,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_label": eval_label,
                "llm_score": eval_score,
                "llm_label_input": eval_in,
                "llm_label_output": eval_out.text,
            }
            for eval_label, eval_score, eval_in, eval_out in zip(
                evaluator_label_list,
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_label.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMLabel

Let LanguageModel evaluate the output of another LanguageModel.

You can specify the evaluation criteria in PromptTemplate. The last label value found in the output of the evaluator is used to compute the evaluation score. You can assign a score to each label. The final output is the average score and the distribution of the labels.
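
In other words, the evaluator may explain its reasoning freely as long as one of the label names appears in its reply; the last occurrence wins and is mapped to its point value. A small sketch of that parsing and scoring logic (illustrative only, not the library's parse_label_from_evaluator_output):

import re

label_names = ["Good", "Bad"]
label_points = {"Good": 1.0, "Bad": 0.0}

def last_label(evaluator_output):
    # Find every occurrence of a valid label and keep the last one.
    matches = re.findall("|".join(re.escape(name) for name in label_names), evaluator_output)
    return matches[-1] if matches else None

evaluator_outputs = [
    "This text is natural, so I label it [[Good]]",
    "This text contains a spelling error, so I label it [[Bad]]",
]
labels = [last_label(text) for text in evaluator_outputs]
scores = [label_points[label] for label in labels if label is not None]
print(labels)                     # ['Good', 'Bad']
print(sum(scores) / len(scores))  # 0.5, the 'llm_score' summary value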

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • label_names (list[str]) –

    A list of valid label names.

  • label_points (list[float | int] | None, default: None ) –

    A list of points for each label specified in label_names.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import OpenAIChatAPI, Jinja2PromptTemplate, LLMLabel
>>> language_model = OpenAIChatAPI(model="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text on a scale of Good/Bad.\n`{{ lm_output }}`\nPut the label at the end like [[Good]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> label_names = ["Good", "Bad"]
>>> label_points = [1.0, 0.0]
>>> llm_label = LLMLabel(language_model, prompt_template, label_names, label_points)
>>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
>>> result = llm_label.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_label': 'Good',
            'llm_score': 1.0,
            'llm_label_input': 'Evaluate the quality of this text...',
            'llm_label_output': 'This text is natural, ... [[Good]]'
        },
        {
            'llm_label': 'Bad',
            'llm_score': 0.0,
            'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\n`Good mrrrning!`\nPut the label at the end like [[Good]].',
            'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
        }
    ]
)
Source code in flexeval/core/metric/llm_label.py
class LLMLabel(Metric):
    """Let LanguageModel to evaluate the output of another LanguageModel.

    You can specify the evaluation criteria in `PromptTemplate`.
    The last label value found in the output of the evaluator is used to compute the evaluation score.
    You can assign a score to each label.
    The final output is the average score and the distribution of the labels.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        label_names: A list of valid label names.
        label_points: A list of points for each label specified in label_names.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import OpenAIChatAPI, Jinja2PromptTemplate, LLMLabel
        >>> language_model = OpenAIChatAPI(model="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text on a scale of Good/Bad.\\n`{{ lm_output }}`\\nPut the label at the end like [[Good]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> label_names = ["Good", "Bad"]
        >>> label_points = [1.0, 0.0]
        >>> llm_label = LLMLabel(language_model, prompt_template, label_names, label_points)
        >>> lm_outputs = ["Hello, world!", "Good mrrrning!"]
        >>> result = llm_label.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 0.5, 'llm_label_distribution': {'Good': 0.5, 'Bad': 0.5}, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_label': 'Good',
                    'llm_score': 1.0,
                    'llm_label_input': 'Evaluate the quality of this text...',
                    'llm_label_output': 'This text is natural, ... [[Good]]'
                },
                {
                    'llm_label': 'Bad',
                    'llm_score': 0.0,
                    'llm_label_input': 'Evaluate the quality of this text on a scale of Good/Bad.\\n`Good mrrrning!`\\nPut the label at the end like [[Good]].',
                    'llm_label_output': 'This text contains a spelling error, ... [[Bad]]'
                }
            ]
        )
    """  # noqa: E501

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        label_names: list[str],
        label_points: list[float | int] | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.label_names = [re.escape(label) for label in label_names]

        if label_points:
            if len(self.label_names) != len(label_points):
                msg = "The lengths of label_names and weights do not match."
                raise ValueError(msg)
            label_points: list[float] = list(map(float, label_points))
        else:
            label_points = [0.0] * len(label_names)
            label_points[0] = 1.0

        self.weights = label_points
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
        )

        evaluator_label_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_label = parse_label_from_evaluator_output(
                evaluator_output.text,
                label_names=self.label_names,
            )
            if evaluator_label is None:
                logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
            evaluator_label_list.append(evaluator_label)

        label2point = dict(zip(self.label_names, self.weights))
        evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

        summary = summarize_evaluator_labels(
            evaluator_label_list,
            extra_info_list,
            self.label_names,
            self.weights,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {
                    "llm_label": eval_label,
                    "llm_score": eval_score,
                    "llm_label_input": eval_in,
                    "llm_label_output": eval_out.text,
                }
                for eval_label, eval_score, eval_in, eval_out in zip(
                    evaluator_label_list,
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

label_names instance-attribute

label_names = [escape(label) for label in label_names]

weights instance-attribute

weights = label_points

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_label.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    label_names: list[str],
    label_points: list[float | int] | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.label_names = [re.escape(label) for label in label_names]

    if label_points:
        if len(self.label_names) != len(label_points):
            msg = "The lengths of label_names and weights do not match."
            raise ValueError(msg)
        label_points: list[float] = list(map(float, label_points))
    else:
        label_points = [0.0] * len(label_names)
        label_points[0] = 1.0

    self.weights = label_points
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_label.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
    )

    evaluator_label_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_label = parse_label_from_evaluator_output(
            evaluator_output.text,
            label_names=self.label_names,
        )
        if evaluator_label is None:
            logger.warning(f"Failed to parse label from evaluator output: {evaluator_output}")
        evaluator_label_list.append(evaluator_label)

    label2point = dict(zip(self.label_names, self.weights))
    evaluator_score_list: list[float | None] = [label2point.get(label) for label in evaluator_label_list]

    summary = summarize_evaluator_labels(
        evaluator_label_list,
        extra_info_list,
        self.label_names,
        self.weights,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {
                "llm_label": eval_label,
                "llm_score": eval_score,
                "llm_label_input": eval_in,
                "llm_label_output": eval_out.text,
            }
            for eval_label, eval_score, eval_in, eval_out in zip(
                evaluator_label_list,
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_label.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

ChatLLMScore

A metric that evaluates the output of LanguageModel.batch_generate_chat_response.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • system_message (str | PromptTemplate | None, default: None ) –

    A system message to be prepended to the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • valid_score_range (tuple[int, int] | None, default: None ) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.
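
The evaluator is expected to wrap its verdict like [[4]]; a score is parsed from the reply, discarded if it falls outside valid_score_range, and the remaining scores are averaged (per category as well, when category_key is set). Below is a minimal parsing sketch (assumed behaviour, not the library's exact parser).

import re

valid_score_range = (1, 5)

def parse_score(evaluator_output):
    # Take the last [[N]] pattern in the reply and keep it only if it is in range.
    matches = re.findall(r"\[\[(\d+)\]\]", evaluator_output)
    if not matches:
        return None
    score = int(matches[-1])
    low, high = valid_score_range
    return score if low <= score <= high else None

evaluator_outputs = [
    "This text is very simple, ... Therefore, its quality is average. [[2]]",
    "... Overall, the quality of the text is good but basic. [[4]]",
]
scores = [parse_score(text) for text in evaluator_outputs]
valid_scores = [s for s in scores if s is not None]
print(sum(valid_scores) / len(valid_scores))  # 3.0, matching 'llm_score' in the example below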

Examples:

>>> from flexeval import ChatLLMScore, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nPut the score at the end like [[5]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> system_message = "This is the system message."
>>> llm_score = ChatLLMScore(language_model, prompt_template, system_message)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_score.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_score': 2,
            'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
            'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
        {
            'llm_score': 4,
            'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
            'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
    ]
)
Source code in flexeval/core/metric/llm_score.py
class ChatLLMScore(Metric):
    """
    A metric that evaluates the output of `LanguageModel.batch_generate_chat_response`.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        system_message: A system message to be prepended to the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        valid_score_range: A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import ChatLLMScore, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nPut the score at the end like [[5]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> system_message = "This is the system message."
        >>> llm_score = ChatLLMScore(language_model, prompt_template, system_message)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_score.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_score': 2,
                    'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
                    'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
                {
                    'llm_score': 4,
                    'llm_score_input': [{'role': 'user', 'content': 'Evaluate the quality of this text...'}],
                    'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        system_message: str | PromptTemplate | None = None,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.system_message = system_message
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        # Compute metrics
        evaluator_input_list = prepare_chat_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
        )

        evaluator_score_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_score = parse_score_from_evaluator_output(
                evaluator_output.text,
                valid_score_range=self.valid_score_range,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
            evaluator_score_list.append(evaluator_score)

        summary = summarize_evaluator_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
                for eval_score, eval_in, eval_out in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

system_message instance-attribute

system_message = system_message

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    system_message: str | PromptTemplate | None = None,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.system_message = system_message
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    # Compute metrics
    evaluator_input_list = prepare_chat_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template, self.system_message
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating ChatLLM score"
    )

    evaluator_score_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_score = parse_score_from_evaluator_output(
            evaluator_output.text,
            valid_score_range=self.valid_score_range,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
        evaluator_score_list.append(evaluator_score)

    summary = summarize_evaluator_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
            for eval_score, eval_in, eval_out in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

LLMScore

Let a LanguageModel evaluate the output of another LanguageModel.

You can specify the evaluation criteria in PromptTemplate. The last integer value in the output of the evaluator is used as the evaluation score.

Parameters:

  • language_model (LanguageModel) –

    An instance of LanguageModel to evaluate the output of the model.

  • prompt_template (PromptTemplate) –

    An instance of PromptTemplate to embed the input for the evaluator.

  • batch_size (int, default: 4 ) –

    The batch size for the evaluator.

  • disable_tqdm (bool, default: False ) –

    Whether to disable the progress bar.

  • valid_score_range (tuple[int, int] | None, default: None ) –

    A tuple of two integers representing the valid score range. If the parsed score is out of the range, it will be ignored.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

Examples:

>>> from flexeval import LLMScore, OpenAIChatAPI, Jinja2PromptTemplate
>>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
>>> template = "Evaluate the quality of this text.\n`{{ lm_output }}`\nPut the score at the end like [[5]]."
>>> prompt_template = Jinja2PromptTemplate(template)
>>> llm_score = LLMScore(language_model, prompt_template)
>>> lm_outputs = ["Hello, world!", "Good morning!"]
>>> result = llm_score.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
    instance_details=[
        {
            'llm_score': 2,
            'llm_score_input': 'Evaluate the quality of this text...',
            'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
        {
            'llm_score': 4,
            'llm_score_input': 'Evaluate the quality of this text...',
            'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
    ]
)
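
As noted above, the last integer in the evaluator's reply is taken as the score. The snippet below is a minimal re-implementation of that idea for illustration only; the library's own parse_score_from_evaluator_output may differ in details such as how bracketed scores or negative numbers are handled.

from __future__ import annotations

import re

def last_integer(text: str, valid_score_range: tuple[int, int] | None = None) -> int | None:
    """Illustrative only: return the last integer in `text`, or None if absent or out of range."""
    matches = re.findall(r"-?\d+", text)
    if not matches:
        return None
    score = int(matches[-1])
    if valid_score_range is not None and not (valid_score_range[0] <= score <= valid_score_range[1]):
        return None
    return score

print(last_integer("Overall, the quality is good but basic. [[4]]"))  # 4
print(last_integer("I would give it an 11 out of 10.", valid_score_range=(1, 5)))  # None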
Source code in flexeval/core/metric/llm_score.py
class LLMScore(Metric):
    """Let LanguageModel to evaluate the output of another LanguageModel.

    You can specify the evaluation criteria in `PromptTemplate`.
    The last integer value in the output of the evaluator is used as the evaluation score.

    Args:
        language_model: An instance of `LanguageModel` to evaluate the output of the model.
        prompt_template: An instance of `PromptTemplate` to embed the input for the evaluator.
        batch_size: The batch size for the evaluator.
        disable_tqdm: Whether to disable the progress bar.
        valid_score_range: A tuple of two integers representing the valid score range.
            If the parsed score is out of the range, it will be ignored.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.

    Examples:
        >>> from flexeval import LLMScore, OpenAIChatAPI, Jinja2PromptTemplate
        >>> language_model = OpenAIChatAPI(model_name="gpt-3.5-turbo")
        >>> template = "Evaluate the quality of this text.\\n`{{ lm_output }}`\\nPut the score at the end like [[5]]."
        >>> prompt_template = Jinja2PromptTemplate(template)
        >>> llm_score = LLMScore(language_model, prompt_template)
        >>> lm_outputs = ["Hello, world!", "Good morning!"]
        >>> result = llm_score.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'llm_score': 3.0, 'num_failed_score_parses': 0},
            instance_details=[
                {
                    'llm_score': 2,
                    'llm_score_input': 'Evaluate the quality of this text...',
                    'llm_score_output': 'This text is very simple,... Therefore, its quality is average. [[2]]'},
                {
                    'llm_score': 4,
                    'llm_score_input': 'Evaluate the quality of this text...',
                    'llm_score_output': '... Overall, the quality of the text is good but basic. [[4]]'}
            ]
        )
    """

    def __init__(
        self,
        language_model: LanguageModel,
        prompt_template: PromptTemplate,
        batch_size: int = 4,
        disable_tqdm: bool = False,
        valid_score_range: tuple[int, int] | None = None,
        category_key: str | None = None,
    ) -> None:
        self.language_model = language_model
        self.prompt_template = prompt_template
        self.batch_size = batch_size
        self.disable_tqdm = disable_tqdm
        self.valid_score_range = valid_score_range
        self.category_key = category_key

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if extra_info_list is None:
            extra_info_list = [{} for _ in lm_outputs]
        if references_list is None:
            references_list = [[] for _ in lm_outputs]

        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
            lm_outputs, references_list, extra_info_list, self.prompt_template
        )
        evaluator_output_list: list[LMOutput] = generate_evaluations(
            evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
        )

        evaluator_score_list: list[int | None] = []
        for evaluator_output in evaluator_output_list:
            evaluator_score = parse_score_from_evaluator_output(
                evaluator_output.text,
                valid_score_range=self.valid_score_range,
            )
            if evaluator_score is None:
                logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
            evaluator_score_list.append(evaluator_score)

        summary = summarize_evaluator_scores(
            evaluator_score_list,
            extra_info_list,
            self.category_key,
        )

        return MetricResult(
            summary,
            instance_details=[
                {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
                for eval_score, eval_in, eval_out in zip(
                    evaluator_score_list,
                    evaluator_input_list,
                    evaluator_output_list,
                )
            ],
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
        )

language_model instance-attribute

language_model = language_model

prompt_template instance-attribute

prompt_template = prompt_template

batch_size instance-attribute

batch_size = batch_size

disable_tqdm instance-attribute

disable_tqdm = disable_tqdm

valid_score_range instance-attribute

valid_score_range = valid_score_range

category_key instance-attribute

category_key = category_key

__init__

__init__(
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/llm_score.py
def __init__(
    self,
    language_model: LanguageModel,
    prompt_template: PromptTemplate,
    batch_size: int = 4,
    disable_tqdm: bool = False,
    valid_score_range: tuple[int, int] | None = None,
    category_key: str | None = None,
) -> None:
    self.language_model = language_model
    self.prompt_template = prompt_template
    self.batch_size = batch_size
    self.disable_tqdm = disable_tqdm
    self.valid_score_range = valid_score_range
    self.category_key = category_key

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/llm_score.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    if extra_info_list is None:
        extra_info_list = [{} for _ in lm_outputs]
    if references_list is None:
        references_list = [[] for _ in lm_outputs]

    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    evaluator_input_list: list[str] = prepare_text_input_for_evaluator(
        lm_outputs, references_list, extra_info_list, self.prompt_template
    )
    evaluator_output_list: list[LMOutput] = generate_evaluations(
        evaluator_input_list, self.language_model, self.batch_size, self.disable_tqdm, "Calculating LLM score"
    )

    evaluator_score_list: list[int | None] = []
    for evaluator_output in evaluator_output_list:
        evaluator_score = parse_score_from_evaluator_output(
            evaluator_output.text,
            valid_score_range=self.valid_score_range,
        )
        if evaluator_score is None:
            logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")
        evaluator_score_list.append(evaluator_score)

    summary = summarize_evaluator_scores(
        evaluator_score_list,
        extra_info_list,
        self.category_key,
    )

    return MetricResult(
        summary,
        instance_details=[
            {"llm_score": eval_score, "llm_score_input": eval_in, "llm_score_output": eval_out.text}
            for eval_score, eval_in, eval_out in zip(
                evaluator_score_list,
                evaluator_input_list,
                evaluator_output_list,
            )
        ],
    )

__repr__

__repr__() -> str
Source code in flexeval/core/metric/llm_score.py
def __repr__(self) -> str:
    return (
        f"{self.__class__.__name__}(language_model={self.language_model}, prompt_template={self.prompt_template})"
    )

OutputLengthStats

Compute statistics on the length of the outputs.

Examples:

>>> from flexeval import OutputLengthStats
>>> output_length_stats = OutputLengthStats()
>>> lm_outputs = ["123456", "123456789"]
>>> result = output_length_stats.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'avg_output_length': 7.5, 'max_output_length': 9, 'min_output_length': 6},
    instance_details=[{'output_length': 6}, {'output_length': 9}]
)
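
Lengths here are plain character counts (len of each output string). Since the per-instance details carry the same numbers, they can be used directly to flag unusually long generations, as in this small sketch:

from flexeval import OutputLengthStats

stats = OutputLengthStats()
result = stats.evaluate(["short answer", "a" * 2000])

# Flag outputs longer than some character budget (1000 here is an arbitrary choice).
too_long = [i for i, d in enumerate(result.instance_details) if d["output_length"] > 1000]
print(result.summary["avg_output_length"], too_long)  # 1006.0 [1]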
Source code in flexeval/core/metric/output_length_stats.py
class OutputLengthStats(Metric):
    """
    Compute statistics on the length of the outputs.

    Examples:
        >>> from flexeval import OutputLengthStats
        >>> output_length_stats = OutputLengthStats()
        >>> lm_outputs = ["123456", "123456789"]
        >>> result = output_length_stats.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'avg_output_length': 7.5, 'max_output_length': 9, 'min_output_length': 6},
            instance_details=[{'output_length': 6}, {'output_length': 9}]
        )
    """

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        # Compute metrics
        output_length_list = [len(output) for output in lm_outputs]
        return MetricResult(
            {
                "avg_output_length": sum(output_length_list) / len(output_length_list),
                "max_output_length": max(output_length_list),
                "min_output_length": min(output_length_list),
            },
            instance_details=[{"output_length": s} for s in output_length_list],
        )

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/output_length_stats.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    # Compute metrics
    output_length_list = [len(output) for output in lm_outputs]
    return MetricResult(
        {
            "avg_output_length": sum(output_length_list) / len(output_length_list),
            "max_output_length": max(output_length_list),
            "min_output_length": min(output_length_list),
        },
        instance_details=[{"output_length": s} for s in output_length_list],
    )

PerspectiveAPI

A metric that evaluates text outputs using the Perspective API. Please set the PERSPECTIVE_API_KEY environment variable.

Parameters:

  • languages (list[str]) –

    A list of languages to analyze.

Examples:

>>> from flexeval import PerspectiveAPI
>>> perspective_api = PerspectiveAPI(languages=["en"])
>>> lm_outputs = ["I love you", "I hate you"]
>>> result = perspective_api.evaluate(lm_outputs)
>>> print(result)
MetricResult(
    summary={'TOXICITY': 0.35407552, ..., 'THREAT': 0.0265799825},
    instance_details=[
        {'TOXICITY': 0.02543884, ..., 'THREAT': 0.009204263},
        {'TOXICITY': 0.6827122, ..., 'THREAT': 0.043955702}
        ]
    )
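
The API key is read from the PERSPECTIVE_API_KEY environment variable. Exactly when the package reads it depends on its internals, so the safest option is to export the variable in the shell before starting Python; the sketch below shows the in-process alternative of setting it before flexeval is imported.

import os

# Must be set before flexeval reads it; exporting it in the shell beforehand is safer.
os.environ["PERSPECTIVE_API_KEY"] = "<your-perspective-api-key>"

from flexeval import PerspectiveAPI

perspective_api = PerspectiveAPI(languages=["en"])
result = perspective_api.evaluate(["I love you"])
print(result.summary["TOXICITY"])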
Source code in flexeval/core/metric/perspective_api.py
class PerspectiveAPI(Metric):
    """A metric that evaluates text outputs using the Perspective API.
    Please set the `PERSPECTIVE_API_KEY` environment variable.

    Args:
        languages: A list of languages to analyze.

    Examples:
        >>> from flexeval import PerspectiveAPI
        >>> perspective_api = PerspectiveAPI(languages=["en"])
        >>> lm_outputs = ["I love you", "I hate you"]
        >>> result = perspective_api.evaluate(lm_outputs)
        >>> print(result)
        MetricResult(
            summary={'TOXICITY': 0.35407552, ..., 'THREAT': 0.0265799825},
            instance_details=[
                {'TOXICITY': 0.02543884, ..., 'THREAT': 0.009204263},
                {'TOXICITY': 0.6827122, ..., 'THREAT': 0.043955702}
                ]
            )
    """

    def __init__(self, languages: list[str]) -> None:
        self.client = discovery.build(
            "commentanalyzer",
            "v1alpha1",
            developerKey=PERSPECTIVE_API_KEY,
            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
            static_discovery=False,
        )
        self.languages = languages
        self.attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT", "PROFANITY", "THREAT"]

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]] | None = None,
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        # Compute metrics
        instance_details = []
        for lm_output in lm_outputs:
            if lm_output == "":
                instance_details.append({att: 0.0 for att in self.attributes})
                continue
            analyze_request = {
                "comment": {"text": lm_output},
                "languages": self.languages,
                "requestedAttributes": {att: {} for att in self.attributes},
            }
            response = retry_on_error(perspectiveapi_call=self.client.comments().analyze(body=analyze_request).execute)
            instance_details.append(
                {att: response["attributeScores"][att]["summaryScore"]["value"] for att in self.attributes},
            )
        scores_for_attribute = {att: [] for att in self.attributes}
        for instance in instance_details:
            for att in self.attributes:
                scores_for_attribute[att].append(instance[att])
        average_scores = {att: np.mean(scores_for_attribute[att]) for att in self.attributes}
        return MetricResult(average_scores, instance_details=instance_details)

client instance-attribute

client = build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=PERSPECTIVE_API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

languages instance-attribute

languages = languages

attributes instance-attribute

attributes = [
    "TOXICITY",
    "SEVERE_TOXICITY",
    "IDENTITY_ATTACK",
    "INSULT",
    "PROFANITY",
    "THREAT",
]

__init__

__init__(languages: list[str]) -> None
Source code in flexeval/core/metric/perspective_api.py
def __init__(self, languages: list[str]) -> None:
    self.client = discovery.build(
        "commentanalyzer",
        "v1alpha1",
        developerKey=PERSPECTIVE_API_KEY,
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )
    self.languages = languages
    self.attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT", "PROFANITY", "THREAT"]

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/perspective_api.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]] | None = None,
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    # Compute metrics
    instance_details = []
    for lm_output in lm_outputs:
        if lm_output == "":
            instance_details.append({att: 0.0 for att in self.attributes})
            continue
        analyze_request = {
            "comment": {"text": lm_output},
            "languages": self.languages,
            "requestedAttributes": {att: {} for att in self.attributes},
        }
        response = retry_on_error(perspectiveapi_call=self.client.comments().analyze(body=analyze_request).execute)
        instance_details.append(
            {att: response["attributeScores"][att]["summaryScore"]["value"] for att in self.attributes},
        )
    scores_for_attribute = {att: [] for att in self.attributes}
    for instance in instance_details:
        for att in self.attributes:
            scores_for_attribute[att].append(instance[att])
    average_scores = {att: np.mean(scores_for_attribute[att]) for att in self.attributes}
    return MetricResult(average_scores, instance_details=instance_details)

RepetitionCount

A metric that counts the number of repetitions of the most repeated pattern in the model's output.

Parameters:

  • count_threshold (int, default: 30 ) –

    The minimum repetition count at which an output is flagged as a repetition.

  • threshold_length (int, default: 10 ) –

    The length of the character pattern whose repetitions are counted.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: None ) –

    StringProcessor or a list of StringProcessors to apply to the model outputs before analysis.

Examples:

>>> from flexeval import RepetitionCount
>>> repetition_count = RepetitionCount()
>>> lm_outputs = ["hello hello hello hello hello hello hello hello hello hello"]
>>> references_list = [[]]  # Not used for this metric
>>> result = repetition_count.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'repetition_ratio': 1.0},
    instance_details=[{'most_repeated_pattern': 'hello hell', 'repetition_count': 9, 'is_repetition': True}]
)
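
Conceptually, the metric finds the fixed-length character pattern that recurs most often in each output and flags the output once that count reaches count_threshold. The snippet below re-implements that idea with a Counter purely for illustration; the actual get_most_repeated_pattern helper may count occurrences differently.

from collections import Counter

def most_repeated_pattern(text: str, pattern_length: int = 10) -> tuple[str, int]:
    """Illustrative stand-in: the most frequent substring of `pattern_length` characters."""
    counts = Counter(text[i : i + pattern_length] for i in range(len(text) - pattern_length + 1))
    if not counts:
        return "", 0
    pattern, count = counts.most_common(1)[0]
    return pattern, count

pattern, count = most_repeated_pattern("spam " * 50)
print(repr(pattern), count, count >= 30)  # flagged at the default count_threshold of 30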
Source code in flexeval/core/metric/repetition_count.py
class RepetitionCount(Metric):
    """
    A metric that counts the number of repetitions of the most repeated pattern in the model's output.

    Args:
        lm_output_processor: StringProcessor or a list of StringProcessors to apply to the model outputs before analysis.

    Examples:
        >>> from flexeval import RepetitionCount
        >>> repetition_count = RepetitionCount()
        >>> lm_outputs = ["hello hello hello hello hello hello hello hello hello hello"]
        >>> references_list = [[]]  # Not used for this metric
        >>> result = repetition_count.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'repetition_ratio': 1.0},
            instance_details=[{'most_repeated_pattern': 'hello hell', 'repetition_count': 9, 'is_repetition': True}]
        )
    """

    def __init__(
        self,
        count_threshold: int = 30,
        threshold_length: int = 10,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
    ) -> None:
        self.count_threshold = count_threshold
        self.threshold_length = threshold_length
        self.lm_output_processors = lm_output_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],  # Not used in this metric
        extra_info_list: list[dict[str, str]] | None = None,  # Not used in this metric
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)
        # Normalize text data
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

        # Compute metrics
        repetition_details: list[dict[str, Any]] = []
        num_repetitions = 0
        for output in lm_outputs:
            most_repeated_pattern, count = get_most_repeated_pattern(output, threshold_length=self.threshold_length)
            is_repetition = count >= self.count_threshold
            repetition_details.append(
                {
                    "most_repeated_pattern": most_repeated_pattern,
                    "repetition_count": count,
                    "is_repetition": is_repetition,
                }
            )
            num_repetitions += int(is_repetition)

        repetition_rate = num_repetitions / len(lm_outputs)

        return MetricResult(
            summary={"repetition_ratio": repetition_rate},
            instance_details=repetition_details,
        )

count_threshold instance-attribute

count_threshold = count_threshold

threshold_length instance-attribute

threshold_length = threshold_length

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

__init__

__init__(
    count_threshold: int = 30,
    threshold_length: int = 10,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = None,
) -> None
Source code in flexeval/core/metric/repetition_count.py
def __init__(
    self,
    count_threshold: int = 30,
    threshold_length: int = 10,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
) -> None:
    self.count_threshold = count_threshold
    self.threshold_length = threshold_length
    self.lm_output_processors = lm_output_processor

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/repetition_count.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],  # Not used in this metric
    extra_info_list: list[dict[str, str]] | None = None,  # Not used in this metric
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)
    # Normalize text data
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]

    # Compute metrics
    repetition_details: list[dict[str, Any]] = []
    num_repetitions = 0
    for output in lm_outputs:
        most_repeated_pattern, count = get_most_repeated_pattern(output, threshold_length=self.threshold_length)
        is_repetition = count >= self.count_threshold
        repetition_details.append(
            {
                "most_repeated_pattern": most_repeated_pattern,
                "repetition_count": count,
                "is_repetition": is_repetition,
            }
        )
        num_repetitions += int(is_repetition)

    repetition_rate = num_repetitions / len(lm_outputs)

    return MetricResult(
        summary={"repetition_ratio": repetition_rate},
        instance_details=repetition_details,
    )

ROUGE

An implementation of ROUGE.

The calculation is based on the rouge library.

Parameters:

  • tokenizer (Tokenizer) –

    An instance of Tokenizer to tokenize the input and output strings.

Examples:

>>> from flexeval import ROUGE
>>> from flexeval import WhitespaceTokenizer
>>> tokenizer = WhitespaceTokenizer()
>>> rouge = ROUGE(tokenizer)
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
>>> result = rouge.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
    instance_details=[
        {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
        {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995}
    ]
)
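
Note that, as the source below shows, only the first reference of each instance is scored. Within ROUGE the tokenizer is used only through its tokenize method, so for outputs that are not whitespace-delimited a character-level tokenizer can be duck-typed in, as in this sketch (a proper setup would subclass flexeval's Tokenizer):

from flexeval import ROUGE

class CharacterTokenizer:
    """Duck-typed stand-in for a flexeval Tokenizer: splits a string into characters."""

    def tokenize(self, text: str) -> list[str]:
        return list(text)

rouge = ROUGE(CharacterTokenizer())
result = rouge.evaluate(["abcd"], [["abce", "this second reference is ignored"]])
print(result.summary)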
Source code in flexeval/core/metric/rouge.py
class ROUGE(Metric):
    """An implementation of [ROUGE](https://aclanthology.org/W04-1013/).

    The calculation is based on the [rouge](https://github.com/pltrdy/rouge) library.

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.

    Examples:
        >>> from flexeval import ROUGE
        >>> from flexeval import WhitespaceTokenizer
        >>> tokenizer = WhitespaceTokenizer()
        >>> rouge = ROUGE(tokenizer)
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["I am a teacher ."]]
        >>> result = rouge.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
            instance_details=[
                {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995},
                {'rouge1': 0.999999995, 'rouge2': 0.999999995, 'rougeL': 0.999999995}
            ]
        )
    """

    def __init__(self, tokenizer: Tokenizer) -> None:
        self._tokenizer = tokenizer

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data - we only need the first reference
        target_summaries = [references[0] for references in references_list]

        tokenized_lm_outputs = [" ".join(self._tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
        tokenized_target_summaries = [
            " ".join(self._tokenizer.tokenize(target_summary)) for target_summary in target_summaries
        ]

        # replace empty string with " " to avoid "ValueError: Hypothesis is empty" from rouge
        tokenized_lm_outputs = [o if o else " " for o in tokenized_lm_outputs]

        # Compute metrics
        rouge = RougeCalculator()
        score_outputs = rouge.get_scores(
            tokenized_lm_outputs,
            tokenized_target_summaries,
        )

        rouge1_list = [o["rouge-1"]["f"] for o in score_outputs]
        rouge2_list = [o["rouge-2"]["f"] for o in score_outputs]
        rouge_l_list = [o["rouge-l"]["f"] for o in score_outputs]

        # we only need the f1 score
        return MetricResult(
            {
                "rouge1": sum(rouge1_list) / len(rouge1_list),
                "rouge2": sum(rouge2_list) / len(rouge2_list),
                "rougeL": sum(rouge_l_list) / len(rouge_l_list),
            },
            instance_details=[
                {"rouge1": r1, "rouge2": r2, "rougeL": rL} for r1, r2, rL in zip(rouge1_list, rouge2_list, rouge_l_list)
            ],
        )

__init__

__init__(tokenizer: Tokenizer) -> None
Source code in flexeval/core/metric/rouge.py
def __init__(self, tokenizer: Tokenizer) -> None:
    self._tokenizer = tokenizer

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/rouge.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data - we only need the first reference
    target_summaries = [references[0] for references in references_list]

    tokenized_lm_outputs = [" ".join(self._tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
    tokenized_target_summaries = [
        " ".join(self._tokenizer.tokenize(target_summary)) for target_summary in target_summaries
    ]

    # replace empty string with " " to avoid "ValueError: Hypothesis is empty" from rouge
    tokenized_lm_outputs = [o if o else " " for o in tokenized_lm_outputs]

    # Compute metrics
    rouge = RougeCalculator()
    score_outputs = rouge.get_scores(
        tokenized_lm_outputs,
        tokenized_target_summaries,
    )

    rouge1_list = [o["rouge-1"]["f"] for o in score_outputs]
    rouge2_list = [o["rouge-2"]["f"] for o in score_outputs]
    rouge_l_list = [o["rouge-l"]["f"] for o in score_outputs]

    # we only need the f1 score
    return MetricResult(
        {
            "rouge1": sum(rouge1_list) / len(rouge1_list),
            "rouge2": sum(rouge2_list) / len(rouge2_list),
            "rougeL": sum(rouge_l_list) / len(rouge_l_list),
        },
        instance_details=[
            {"rouge1": r1, "rouge2": r2, "rougeL": rL} for r1, r2, rL in zip(rouge1_list, rouge2_list, rouge_l_list)
        ],
    )

SARI

An implementation of SARI, a metric for evaluating text simplification.

Based on the original implementation [1], modified to allow configurable settings for the maximum n-gram size and tokenizer. Additionally, it fixes a bug present in the original implementation [2]. When used with the default parameters, it produces scores that are consistent with the HuggingFace/evaluate implementation [3].

[1] https://github.com/cocoxu/simplification/blob/master/SARI.py
[2] https://github.com/cocoxu/simplification/issues/6
[3] https://huggingface.co/spaces/evaluate-metric/sari/blob/main/sari.py

Parameters:

  • tokenizer (Tokenizer | Literal['default'], default: 'default' ) –

    An instance of Tokenizer to tokenize the input and output strings.

  • max_ngrams (int, default: 4 ) –

    The maximum n-gram order to consider. Defaults to 4.

  • category_key (str | None, default: None ) –

    A key to create category-wise mean score. The category key is expected to be in extra_info.

  • lm_output_processor (StringProcessor | list[StringProcessor] | None, default: DEFAULT_STRING_PROCESSOR ) –

    StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.

  • reference_processor (StringProcessor | list[StringProcessor] | None, default: DEFAULT_STRING_PROCESSOR ) –

    StringProcessor or list of StringProcessor to apply to the references before comparison.

  • source_processor (StringProcessor | list[StringProcessor] | None, default: DEFAULT_STRING_PROCESSOR ) –

    StringProcessor or list of StringProcessor to apply to the source sentences before comparison.

Examples:

>>> from flexeval import SARI
>>> sari_scorer = SARI(source_key="source")
>>> lm_outputs = ["About 95 you now get in."]
>>> references_list = [["About 95 species are currently known.", "About 95 species are now accepted.", "95 species are now accepted."]]
>>> extra_info_list = [{"source": "About 95 species are currently accepted."}]
>>> result = sari_scorer.evaluate(lm_outputs, references_list, extra_info_list)
>>> print(result)
MetricResult(
    summary={
        'sari_score': 0.2695360195360195,
        'sari_add': 0.08333333333333333,
        'sari_keep': 0.22527472527472525,
        'sari_del': 0.5
    },
    instance_details=[{'sari_score': 0.2695360195360195, 'sari_add': 0.08333333333333333, 'sari_keep': 0.22527472527472525, 'sari_del': 0.5}]
)
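
As the implementation below shows, the overall sari_score is simply the arithmetic mean of the add F1, keep F1, and delete precision components (each already averaged over n-gram orders 1 through max_ngrams). The numbers from the example above confirm this:

sari_add, sari_keep, sari_del = 0.08333333333333333, 0.22527472527472525, 0.5
sari_score = (sari_add + sari_keep + sari_del) / 3
print(sari_score)  # ~0.26953601..., matching 'sari_score' in the summary above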
Source code in flexeval/core/metric/sari.py
class SARI(Metric):
    """An implementation of SARI, a metric for evaluating text simplification.

    Based on the original implementation [1], modified to allow configurable settings
    for the maximum n-gram size and tokenizer.
    Additionally, it fixes a bug present in the original implementation [2].
    When used with the default parameters, it produces scores that are
    consistent with the HuggingFace/evaluate implementation [3].

    [1] https://github.com/cocoxu/simplification/blob/master/SARI.py
    [2] https://github.com/cocoxu/simplification/issues/6
    [3] https://huggingface.co/spaces/evaluate-metric/sari/blob/main/sari.py

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.
        max_ngrams: The maximum n-gram order to consider. Defaults to `4`.
        category_key: A key to create category-wise mean score.
            The category key is expected to be in extra_info.
        lm_output_processor:
            StringProcessor or a list of StringProcessor to be applied to the model outputs before comparison.
        reference_processor: StringProcessor or list of StringProcessor to apply to the references before comparison.
        source_processor: StringProcessor or list of StringProcessor to apply to the source sentences before comparison.

    Examples:
        >>> from flexeval import SARI
        >>> sari_scorer = SARI(source_key="source")
        >>> lm_outputs = ["About 95 you now get in."]
        >>> references_list = [["About 95 species are currently known.", "About 95 species are now accepted.", "95 species are now accepted."]]
        >>> extra_info_list = [{"source": "About 95 species are currently accepted."}]
        >>> result = sari_scorer.evaluate(lm_outputs, references_list, extra_info_list)
        >>> print(result)
        MetricResult(
            summary={
                'sari_score': 0.2695360195360195,
                'sari_add': 0.08333333333333333,
                'sari_keep': 0.22527472527472525,
                'sari_del': 0.5
            },
            instance_details=[{'sari_score': 0.2695360195360195, 'sari_add': 0.08333333333333333, 'sari_keep': 0.22527472527472525, 'sari_del': 0.5}]
        )
    """  # noqa: E501

    def __init__(
        self,
        source_key: str,
        tokenizer: Tokenizer | Literal["default"] = "default",
        max_ngrams: int = 4,
        category_key: str | None = None,
        source_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
        lm_output_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
        reference_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
    ) -> None:
        if tokenizer == "default":
            tokenizer = SacreBleuTokenizer("13a")
        self._tokenizer = tokenizer
        self.source_key = source_key
        self.max_ngrams = max_ngrams
        self.category_key = category_key

        self.source_processors = source_processor
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor

    def evaluate(self, lm_outputs, references_list, extra_info_list=None) -> MetricResult:  # noqa: ANN001
        validate_inputs(lm_outputs, references_list, extra_info_list)

        if extra_info_list is None:
            msg = "SARI requires extra_info_list"
            raise ValueError(msg)
        sources = [extra_info[self.source_key] for extra_info in extra_info_list]

        # Normalize text data
        sources = [apply_string_processors(src, self.source_processors) for src in sources]
        lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
        references_list = [
            [apply_string_processors(ref, self.reference_processors) for ref in references]
            for references in references_list
        ]

        # Compute metrics
        sari_instance_list = [
            self._calc_sentence_sari(source, lm_output, references)
            for source, lm_output, references in zip(sources, lm_outputs, references_list)
        ]

        metric_name2scores = {
            name: [s[name] for s in sari_instance_list] for name in ["sari_score", "sari_add", "sari_keep", "sari_del"]
        }

        num_instances = len(sari_instance_list)
        summary = {
            metric_name: sum(score_list) / num_instances for metric_name, score_list in metric_name2scores.items()
        }

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            for metric_name, score_list in metric_name2scores.items():
                category_wise_scores = aggregate_category_wise_scores(score_list, categories)
                for category, category_wise_score in category_wise_scores.items():
                    summary[f"{metric_name}/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=sari_instance_list,
        )

    def _calc_sentence_sari(self, source: str, lm_output: str, references: list[str]) -> dict[str, float]:
        s_words = self._tokenizer.tokenize(source)
        c_words = self._tokenizer.tokenize(lm_output)
        r_words_list = [self._tokenizer.tokenize(reference) for reference in references]

        sari_score, sari_add, sari_keep, sari_del = 0.0, 0.0, 0.0, 0.0
        for n in range(1, self.max_ngrams + 1):
            s_ngrams = to_ngram(s_words, n)
            c_ngrams = to_ngram(c_words, n)
            r_ngrams_list = [to_ngram(r_words, n) for r_words in r_words_list]

            sari_n_score, sari_n_add, sari_n_keep, sari_n_del = self._sari_n(s_ngrams, c_ngrams, r_ngrams_list)
            sari_score += sari_n_score
            sari_add += sari_n_add
            sari_keep += sari_n_keep
            sari_del += sari_n_del

        sari_score /= self.max_ngrams
        sari_add /= self.max_ngrams
        sari_keep /= self.max_ngrams
        sari_del /= self.max_ngrams

        return {"sari_score": sari_score, "sari_add": sari_add, "sari_keep": sari_keep, "sari_del": sari_del}

    def _sari_n(
        self, s_grams: list[str], c_grams: list[str], r_grams_list: list[list[str]]
    ) -> tuple[float, float, float, float]:
        num_ref = len(r_grams_list)
        r_grams_all = [r_gram for r_grams in r_grams_list for r_gram in r_grams]
        r_gram_counter = Counter(r_grams_all)

        s_gram_counter = Counter(s_grams)
        c_gram_counter = Counter(c_grams)

        s_gram_rep = Counter({k: v * num_ref for k, v in s_gram_counter.items()})
        c_gram_rep = Counter({k: v * num_ref for k, v in c_gram_counter.items()})

        # ADD
        add_grams = set(c_gram_counter) - set(s_gram_counter)
        add_good = add_grams & set(r_gram_counter)
        add_all = set(r_gram_counter) - set(s_gram_counter)

        add_prec = len(add_good) / len(add_grams) if add_grams else 1
        add_recall = len(add_good) / len(add_all) if add_all else 1
        add_f1 = 2 * add_prec * add_recall / (add_prec + add_recall) if (add_prec + add_recall) > 0 else 0

        # KEEP
        keep_rep = s_gram_rep & c_gram_rep
        keep_good = keep_rep & r_gram_counter
        keep_all = s_gram_rep & r_gram_counter

        keep_prec = sum(keep_good[g] / keep_rep[g] for g in keep_good) / len(keep_rep) if keep_rep else 1
        keep_recall = sum(keep_good[g] for g in keep_good) / sum(keep_all.values()) if keep_all else 1
        keep_f1 = 2 * keep_prec * keep_recall / (keep_prec + keep_recall) if (keep_prec + keep_recall) > 0 else 0

        # DELETE
        del_rep = s_gram_rep - c_gram_rep
        del_good = del_rep - r_gram_counter

        del_prec = sum(del_good[g] / del_rep[g] for g in del_good) / len(del_rep) if del_rep else 1

        return (add_f1 + keep_f1 + del_prec) / 3, add_f1, keep_f1, del_prec

source_key instance-attribute

source_key = source_key

max_ngrams instance-attribute

max_ngrams = max_ngrams

category_key instance-attribute

category_key = category_key

source_processors instance-attribute

source_processors = source_processor

lm_output_processors instance-attribute

lm_output_processors = lm_output_processor

reference_processors instance-attribute

reference_processors = reference_processor

__init__

__init__(
    source_key: str,
    tokenizer: Tokenizer | Literal["default"] = "default",
    max_ngrams: int = 4,
    category_key: str | None = None,
    source_processor: StringProcessor
    | list[StringProcessor]
    | None = DEFAULT_STRING_PROCESSOR,
    lm_output_processor: StringProcessor
    | list[StringProcessor]
    | None = DEFAULT_STRING_PROCESSOR,
    reference_processor: StringProcessor
    | list[StringProcessor]
    | None = DEFAULT_STRING_PROCESSOR,
) -> None
Source code in flexeval/core/metric/sari.py
def __init__(
    self,
    source_key: str,
    tokenizer: Tokenizer | Literal["default"] = "default",
    max_ngrams: int = 4,
    category_key: str | None = None,
    source_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
    lm_output_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
    reference_processor: StringProcessor | list[StringProcessor] | None = DEFAULT_STRING_PROCESSOR,
) -> None:
    if tokenizer == "default":
        tokenizer = SacreBleuTokenizer("13a")
    self._tokenizer = tokenizer
    self.source_key = source_key
    self.max_ngrams = max_ngrams
    self.category_key = category_key

    self.source_processors = source_processor
    self.lm_output_processors = lm_output_processor
    self.reference_processors = reference_processor

evaluate

evaluate(
    lm_outputs, references_list, extra_info_list=None
) -> MetricResult
Source code in flexeval/core/metric/sari.py
def evaluate(self, lm_outputs, references_list, extra_info_list=None) -> MetricResult:  # noqa: ANN001
    validate_inputs(lm_outputs, references_list, extra_info_list)

    if extra_info_list is None:
        msg = "SARI requires extra_info_list"
        raise ValueError(msg)
    sources = [extra_info[self.source_key] for extra_info in extra_info_list]

    # Normalize text data
    sources = [apply_string_processors(src, self.source_processors) for src in sources]
    lm_outputs = [apply_string_processors(output, self.lm_output_processors) for output in lm_outputs]
    references_list = [
        [apply_string_processors(ref, self.reference_processors) for ref in references]
        for references in references_list
    ]

    # Compute metrics
    sari_instance_list = [
        self._calc_sentence_sari(source, lm_output, references)
        for source, lm_output, references in zip(sources, lm_outputs, references_list)
    ]

    metric_name2scores = {
        name: [s[name] for s in sari_instance_list] for name in ["sari_score", "sari_add", "sari_keep", "sari_del"]
    }

    num_instances = len(sari_instance_list)
    summary = {
        metric_name: sum(score_list) / num_instances for metric_name, score_list in metric_name2scores.items()
    }

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        for metric_name, score_list in metric_name2scores.items():
            category_wise_scores = aggregate_category_wise_scores(score_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"{metric_name}/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=sari_instance_list,
    )

SubstringMatch

A metric that calculates how many outputs contain any of the expected substrings.

Parameters:

  • mode (Literal['any', 'all'], default: 'any' ) –

    The mode to calculate the substring match.
    - "any": If any of the expected substrings are in the output, it is a match.
    - "all": If all of the expected substrings are in the output, it is a match.

  • category_key (str | None, default: None ) –

    Optional key to group scores by category from extra_info_list.

Examples:

>>> from flexeval import SubstringMatch
>>> substring_match = SubstringMatch()
>>> lm_outputs = ["This is a cat .", "This is a dog ."]
>>> references_list = [["cat", "dog"], ["mouse"]]
>>> result = substring_match.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'substring_match-any': 0.5},
    instance_details=[{'substring_match': True}, {'substring_match': False}]
)
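
With mode="all", every reference substring has to appear in the output, and the mode is reflected in the summary key (substring_match-all). A small sketch:

from flexeval import SubstringMatch

strict_match = SubstringMatch(mode="all")
lm_outputs = ["The cat chased the dog .", "Only a cat here ."]
references_list = [["cat", "dog"], ["cat", "dog"]]
result = strict_match.evaluate(lm_outputs, references_list)
print(result.summary)  # {'substring_match-all': 0.5}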
Source code in flexeval/core/metric/substring_match.py
class SubstringMatch(Metric):
    """
    A metric that calculates how many outputs contain any of the expected substrings.

    Args:
        mode: The mode to calculate the substring match.
            - "any": If any of the expected substrings are in the output, it is a match.
            - "all": If all of the expected substrings are in the output, it is a match.
        category_key: Optional key to group scores by category from extra_info_list.

    Examples:
        >>> from flexeval import SubstringMatch
        >>> substring_match = SubstringMatch()
        >>> lm_outputs = ["This is a cat .", "This is a dog ."]
        >>> references_list = [["cat", "dog"], ["mouse"]]
        >>> result = substring_match.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'substring_match-any': 0.5},
            instance_details=[{'substring_match': True}, {'substring_match': False}]
        )
    """

    def __init__(self, mode: Literal["any", "all"] = "any", category_key: str | None = None) -> None:
        self.mode = mode
        self.category_key = category_key
        if mode == "all":
            self.match_func = all
        elif mode == "any":
            self.match_func = any
        else:
            msg = f"mode must be 'any' or 'all', but got '{mode}'."
            raise ValueError(msg)

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Compute metrics
        match_list = [
            self.match_func(substring in lm_output for substring in expected_output)
            for lm_output, expected_output in zip(lm_outputs, references_list)
        ]

        score = 0.0
        if len(match_list):
            score = sum(match_list) / len(match_list)

        summary = {f"substring_match-{self.mode}": score}

        if self.category_key:
            categories = [extra_info[self.category_key] for extra_info in extra_info_list]
            category_wise_scores = aggregate_category_wise_scores(match_list, categories)
            for category, category_wise_score in category_wise_scores.items():
                summary[f"substring_match-{self.mode}/{category}"] = category_wise_score

        return MetricResult(
            summary,
            instance_details=[{"substring_match": match} for match in match_list],
        )

mode instance-attribute

mode = mode

category_key instance-attribute

category_key = category_key

match_func instance-attribute

match_func = all

__init__

__init__(
    mode: Literal["any", "all"] = "any",
    category_key: str | None = None,
) -> None
Source code in flexeval/core/metric/substring_match.py
def __init__(self, mode: Literal["any", "all"] = "any", category_key: str | None = None) -> None:
    self.mode = mode
    self.category_key = category_key
    if mode == "all":
        self.match_func = all
    elif mode == "any":
        self.match_func = any
    else:
        msg = f"mode must be 'any' or 'all', but got '{mode}'."
        raise ValueError(msg)

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/substring_match.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Compute metrics
    match_list = [
        self.match_func(substring in lm_output for substring in expected_output)
        for lm_output, expected_output in zip(lm_outputs, references_list)
    ]

    score = 0.0
    if len(match_list):
        score = sum(match_list) / len(match_list)

    summary = {f"substring_match-{self.mode}": score}

    if self.category_key:
        categories = [extra_info[self.category_key] for extra_info in extra_info_list]
        category_wise_scores = aggregate_category_wise_scores(match_list, categories)
        for category, category_wise_score in category_wise_scores.items():
            summary[f"substring_match-{self.mode}/{category}"] = category_wise_score

    return MetricResult(
        summary,
        instance_details=[{"substring_match": match} for match in match_list],
    )
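
The category-wise values in the summary come from aggregate_category_wise_scores, a helper in flexeval.core.metric.utils whose implementation is not shown on this page. The sketch below is a rough stand-in under the assumption that it computes the mean score per category; it is illustrative, not the library's actual helper.

from collections import defaultdict

def mean_score_per_category(scores: list[float | bool], categories: list[str]) -> dict[str, float]:
    # Assumed behavior: group the per-instance scores by category and average them.
    grouped: dict[str, list[float]] = defaultdict(list)
    for score, category in zip(scores, categories):
        grouped[category].append(float(score))
    return {category: sum(values) / len(values) for category, values in grouped.items()}

# e.g. mean_score_per_category([True, False, True], ["qa", "qa", "chat"])
# -> {'qa': 0.5, 'chat': 1.0}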

XER

Calculate the Character Error Rate (CER) and Word Error Rate (WER) between the model outputs and the references. Only the first reference of each instance is used. The calculation is based on the jiwer library.

Parameters:

  • tokenizer (Tokenizer | None, default: None ) –

    An instance of Tokenizer used to tokenize the outputs and references before computing WER; CER is computed on the raw strings.

Examples:

>>> from flexeval import XER
>>> xer = XER()
>>> lm_outputs = ["I am a student .", "I am a teacher ."]
>>> references_list = [["I am a student .", "I am a learner ."], ["Are you the student ?"]]
>>> result = xer.evaluate(lm_outputs, references_list)
>>> print(result)
MetricResult(
    summary={'cer_score': 0.43243243243243246, 'wer_score': 0.5},
    instance_details=[{'cer_score': 0.0, 'wer_score': 0.0}, {'cer_score': 0.7619047619047619, 'wer_score': 1.0}]
)
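
Since the implementation below delegates to jiwer's top-level cer and wer functions, the corpus-level numbers in the example can be reproduced with jiwer directly. A minimal sketch, using only the first reference of each instance to mirror the code below:

from jiwer import cer, wer

lm_outputs = ["I am a student .", "I am a teacher ."]
references = ["I am a student .", "Are you the student ?"]  # first reference per instance

print(cer(references, lm_outputs))  # corpus-level CER, ~0.432 as in the example above
print(wer(references, lm_outputs))  # corpus-level WER, 0.5 as in the example above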
Source code in flexeval/core/metric/xer.py
class XER(Metric):
    """
    Calculate the Character Error Rate (CER) and Word Error Rate (WER) between the model outputs and the references.
    The calculation is based on the [jiwer](https://github.com/jitsi/jiwer) library.

    Args:
        tokenizer: An instance of `Tokenizer` to tokenize the input and output strings.

    Examples:
        >>> from flexeval import XER
        >>> xer = XER()
        >>> lm_outputs = ["I am a student .", "I am a teacher ."]
        >>> references_list = [["I am a student .", "I am a learner ."], ["Are you the student ?"]]
        >>> result = xer.evaluate(lm_outputs, references_list)
        >>> print(result)
        MetricResult(
            summary={'cer_score': 0.43243243243243246, 'wer_score': 0.5},
            instance_details=[{'cer_score': 0.0, 'wer_score': 0.0}, {'cer_score': 0.7619047619047619, 'wer_score': 1.0}]
        )
    """

    def __init__(self, tokenizer: Tokenizer | None = None) -> None:
        self.tokenizer = tokenizer

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        extra_info_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        validate_inputs(lm_outputs, references_list, extra_info_list)

        # Normalize text data - we only need the first reference
        references = [references[0] for references in references_list]

        if self.tokenizer:
            tokenized_lm_outputs = [" ".join(self.tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
            tokenized_references = [" ".join(self.tokenizer.tokenize(reference)) for reference in references]
        else:
            tokenized_lm_outputs = lm_outputs
            tokenized_references = references

        # Compute metrics
        cer_score = cer(references, lm_outputs)
        wer_score = wer(tokenized_references, tokenized_lm_outputs)

        return MetricResult(
            {
                "cer_score": cer_score,
                "wer_score": wer_score,
            },
            instance_details=[
                {
                    "cer_score": cer(reference, lm_output),
                    "wer_score": wer(reference, lm_output),
                }
                for lm_output, reference in zip(lm_outputs, references)
            ],
        )

tokenizer instance-attribute

tokenizer = tokenizer

__init__

__init__(tokenizer: Tokenizer | None = None) -> None
Source code in flexeval/core/metric/xer.py
def __init__(self, tokenizer: Tokenizer | None = None) -> None:
    self.tokenizer = tokenizer

evaluate

evaluate(
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult
Source code in flexeval/core/metric/xer.py
def evaluate(
    self,
    lm_outputs: list[str],
    references_list: list[list[str]],
    extra_info_list: list[dict[str, str]] | None = None,
) -> MetricResult:
    validate_inputs(lm_outputs, references_list, extra_info_list)

    # Normalize text data - we only need the first reference
    references = [references[0] for references in references_list]

    if self.tokenizer:
        tokenized_lm_outputs = [" ".join(self.tokenizer.tokenize(lm_output)) for lm_output in lm_outputs]
        tokenized_references = [" ".join(self.tokenizer.tokenize(reference)) for reference in references]
    else:
        tokenized_lm_outputs = lm_outputs
        tokenized_references = references

    # Compute metrics
    cer_score = cer(references, lm_outputs)
    wer_score = wer(tokenized_references, tokenized_lm_outputs)

    return MetricResult(
        {
            "cer_score": cer_score,
            "wer_score": wer_score,
        },
        instance_details=[
            {
                "cer_score": cer(reference, lm_output),
                "wer_score": wer(reference, lm_output),
            }
            for lm_output, reference in zip(lm_outputs, references)
        ],
    )
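
The tokenizer is only used through its tokenize method in the code above. The sketch below shows a whitespace tokenizer that would satisfy that interface; it is a hypothetical, duck-typed stand-in, not one of the library's actual Tokenizer implementations.

from flexeval import XER

class WhitespaceTokenizer:
    # Hypothetical stand-in: XER above only calls tokenizer.tokenize(text).
    def tokenize(self, text: str) -> list[str]:
        return text.split()

xer = XER(tokenizer=WhitespaceTokenizer())
# With this tokenizer, WER is computed over whitespace-separated tokens,
# while CER is still computed over the raw character strings.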