Skip to content

PairwiseScorer

PairwiseScorer

Compute scores for each model given the match results.

Each match result is a triple of two model names and the winner.

Source code in flexeval/core/pairwise_comparison/scorer/base.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
class PairwiseScorer(ABC):
    """Compute scores for each model given the match results.

    Each match result is a triple of two model names and the winner.
    """

    # Identifier for this scorer; subclasses override it (e.g. "win_rate").
    # When left as None, get_name() falls back to the class name.
    name: str | None = None

    @abstractmethod
    def compute_scores(
        self: PairwiseScorer,
        match_results: list[tuple[str, str, Winner]],
    ) -> dict[str, float]:
        # Subclasses return a mapping from model name to its score,
        # computed from the (model1, model2, winner) triples.
        pass

    @classmethod
    def get_name(cls: type[PairwiseScorer]) -> str:
        # Fall back to the class name when `name` is unset (None or empty).
        return cls.name if cls.name else cls.__name__

name class-attribute instance-attribute

name: str = None

compute_scores abstractmethod

compute_scores(
    match_results: list[tuple[str, str, Winner]],
) -> dict[str, float]
Source code in flexeval/core/pairwise_comparison/scorer/base.py
16
17
18
19
20
21
@abstractmethod
def compute_scores(
    self: PairwiseScorer,
    match_results: list[tuple[str, str, Winner]],
) -> dict[str, float]:
    # Abstract: subclasses map (model1, model2, winner) triples to {model: score}.
    pass

get_name classmethod

get_name() -> str
Source code in flexeval/core/pairwise_comparison/scorer/base.py
23
24
25
@classmethod
def get_name(cls: type[PairwiseScorer]) -> str:
    # Use the explicit `name` attribute when set; otherwise the class name.
    return cls.name if cls.name else cls.__name__

BradleyTerryScorer

Source code in flexeval/core/pairwise_comparison/scorer/bradley_terry.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
class BradleyTerryScorer(PairwiseScorer):
    """Score models by fitting a Bradley-Terry model to the match results.

    Fitted strengths are converted to Elo-like ratings via
    ``scale / log(base) * log(strength) + init_rating``.
    """

    name: str = "bradley_terry"

    def __init__(
        self,
        max_iters: int = 1000,
        error_tol: float = 1e-3,
        eps: float = 1e-8,
        base: float = 10.0,
        scale: float = 400.0,
        init_rating: float = 1000.0,
    ) -> None:
        """Configure the fitting procedure.

        Args:
            max_iters: Maximum number of fixed-point iterations.
            error_tol: Stop once the L1 change of scores between two
                consecutive iterations falls below this threshold.
            eps: Small constant added to the denominator to avoid
                division by zero.
            base: Logarithm base of the strength-to-rating conversion.
            scale: Multiplier of the strength-to-rating conversion.
            init_rating: Offset added to the converted ratings.
        """
        self.max_iters = max_iters
        self.error_tol = error_tol
        self.eps = eps
        self.base = base
        self.scale = scale
        self.init_rating = init_rating

    def _gen_winloss_matrix(
        self,
        match_results: list[tuple[str, str, Winner]],
    ) -> dict[str, dict[str, float]]:
        """Given the match results, return a nested dict such that
        ``matrix[model1][model2]`` is the number of times model1 beat model2;
        a draw counts as half a win for each side."""
        matrix = defaultdict(lambda: defaultdict(float))

        for model1, model2, winner in match_results:
            if winner == Winner.MODEL1:
                matrix[model1][model2] += 1.0
            elif winner == Winner.MODEL2:
                matrix[model2][model1] += 1.0
            elif winner == Winner.DRAW:
                matrix[model1][model2] += 0.5
                matrix[model2][model1] += 0.5

        return matrix

    def compute_scores(
        self,
        match_results: list[tuple[str, str, Winner]],
    ) -> dict[str, float]:
        """Given the match results, return per-model ratings estimated with
        the Bradley-Terry model (maximum-likelihood estimate)."""
        model_names = sorted(
            {m[0] for m in match_results} | {m[1] for m in match_results},
        )
        winloss_matrix = self._gen_winloss_matrix(match_results)

        # Fixed-point MLE iteration, eq. (12) of
        # https://jmlr.org/papers/volume24/22-1086/22-1086.pdf#page=5.50 (12)
        scores = pd.Series(np.ones(len(model_names)), index=model_names)
        for iters in range(self.max_iters):
            old_scores = scores.copy()
            # NOTE: scores are updated in place, so models later in this pass
            # already see the values updated earlier in the same pass.
            for target_model in scores.keys():  # noqa: SIM118
                numer = sum(
                    [
                        (winloss_matrix[target_model][other_model] * scores[other_model])
                        / (scores[target_model] + scores[other_model])
                        for other_model in winloss_matrix[target_model]
                    ],
                )
                denom = sum(
                    [
                        (winloss_matrix[other_model][target_model]) / (scores[target_model] + scores[other_model])
                        for other_model in winloss_matrix[target_model]
                    ],
                )

                scores[target_model] = numer / (denom + self.eps)

            # Normalize so the geometric mean of the scores is 1
            # (exp(sum(log s)) is the product of the scores).
            scores /= np.exp(np.log(scores).sum()) ** (1 / len(scores))

            if (scores - old_scores).abs().sum() < self.error_tol:
                logger.info(f" * Converged after {iters} iterations.")
                break
        else:
            logger.info(
                f" * Max iterations reached ({self.max_iters} iters).",
            )

        # Convert strengths to Elo-like ratings and sort best-first.
        return (
            scores.apply(
                lambda x: self.scale / np.log(self.base) * np.log(x) + self.init_rating,
            )
            .sort_values(ascending=False)
            .to_dict()
        )

name class-attribute instance-attribute

name: str = 'bradley_terry'

max_iters instance-attribute

max_iters = max_iters

error_tol instance-attribute

error_tol = error_tol

eps instance-attribute

eps = eps

base instance-attribute

base = base

scale instance-attribute

scale = scale

init_rating instance-attribute

init_rating = init_rating

__init__

__init__(
    max_iters: int = 1000,
    error_tol: float = 0.001,
    eps: float = 1e-08,
    base: float = 10.0,
    scale: float = 400.0,
    init_rating: float = 1000.0,
) -> None
Source code in flexeval/core/pairwise_comparison/scorer/bradley_terry.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def __init__(
    self,
    max_iters: int = 1000,
    error_tol: float = 1e-3,
    eps: float = 1e-8,
    base: float = 10.0,
    scale: float = 400.0,
    init_rating: float = 1000.0,
) -> None:
    """Configure the Bradley-Terry fit.

    Args:
        max_iters: Maximum number of fixed-point iterations.
        error_tol: L1 convergence threshold between iterations.
        eps: Small constant guarding against division by zero.
        base: Logarithm base of the strength-to-rating conversion.
        scale: Multiplier of the strength-to-rating conversion.
        init_rating: Offset added to the converted ratings.
    """
    self.max_iters = max_iters
    self.error_tol = error_tol
    self.eps = eps
    self.base = base
    self.scale = scale
    self.init_rating = init_rating

compute_scores

compute_scores(
    match_results: list[tuple[str, str, Winner]],
) -> dict[str, float]

戦績を受け取り、Bradley-Terry model (MLE) で推定した各モデルのスコアを返す。

Source code in flexeval/core/pairwise_comparison/scorer/bradley_terry.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def compute_scores(
    self,
    match_results: list[tuple[str, str, Winner]],
) -> dict[str, float]:
    """Given the match results, return per-model ratings estimated with
    the Bradley-Terry model (maximum-likelihood estimate)."""
    model_names = sorted(
        {m[0] for m in match_results} | {m[1] for m in match_results},
    )
    winloss_matrix = self._gen_winloss_matrix(match_results)

    # Fixed-point MLE iteration, eq. (12) of
    # https://jmlr.org/papers/volume24/22-1086/22-1086.pdf#page=5.50 (12)
    scores = pd.Series(np.ones(len(model_names)), index=model_names)
    for iters in range(self.max_iters):
        old_scores = scores.copy()
        # NOTE: scores are updated in place, so models later in this pass
        # already see the values updated earlier in the same pass.
        for target_model in scores.keys():  # noqa: SIM118
            numer = sum(
                [
                    (winloss_matrix[target_model][other_model] * scores[other_model])
                    / (scores[target_model] + scores[other_model])
                    for other_model in winloss_matrix[target_model]
                ],
            )
            denom = sum(
                [
                    (winloss_matrix[other_model][target_model]) / (scores[target_model] + scores[other_model])
                    for other_model in winloss_matrix[target_model]
                ],
            )

            scores[target_model] = numer / (denom + self.eps)

        # Normalize so the geometric mean of the scores is 1.
        scores /= np.exp(np.log(scores).sum()) ** (1 / len(scores))

        if (scores - old_scores).abs().sum() < self.error_tol:
            logger.info(f" * Converged after {iters} iterations.")
            break
    else:
        logger.info(
            f" * Max iterations reached ({self.max_iters} iters).",
        )

    # Convert strengths to Elo-like ratings and sort best-first.
    return (
        scores.apply(
            lambda x: self.scale / np.log(self.base) * np.log(x) + self.init_rating,
        )
        .sort_values(ascending=False)
        .to_dict()
    )

WinRateScorer

Source code in flexeval/core/pairwise_comparison/scorer/win_rate.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class WinRateScorer(PairwiseScorer):
    """Score models by their win rate, counting a draw as half a win."""

    name: str = "win_rate"

    def compute_scores(
        self,
        match_results: list[tuple[str, str, Winner]],
    ) -> dict[str, float]:
        """Given the match results, return each model's win rate (0-100),
        sorted from best to worst."""
        played: dict[str, float] = defaultdict(float)
        won: dict[str, float] = defaultdict(float)

        for first, second, outcome in match_results:
            # Both participants played one more match.
            played[first] += 1
            played[second] += 1
            if outcome == Winner.MODEL1:
                won[first] += 1
            elif outcome == Winner.MODEL2:
                won[second] += 1
            elif outcome == Winner.DRAW:
                # A draw is worth half a win to each side.
                won[first] += 0.5
                won[second] += 0.5

        rates = {model: 100 * won.get(model, 0.0) / played[model] for model in played}
        # Sort descending by rate; sorted() is stable, so ties keep insertion order.
        return dict(sorted(rates.items(), key=lambda item: item[1], reverse=True))

name class-attribute instance-attribute

name: str = 'win_rate'

compute_scores

compute_scores(
    match_results: list[tuple[str, str, Winner]],
) -> dict[str, float]

戦績を受け取り、各モデルの勝率を返す。

Source code in flexeval/core/pairwise_comparison/scorer/win_rate.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def compute_scores(
    self,
    match_results: list[tuple[str, str, Winner]],
) -> dict[str, float]:
    """Given the match results, return each model's win rate (0-100)."""
    match_count_dict: dict[str, float] = defaultdict(float)
    win_count_dict: dict[str, float] = defaultdict(float)

    for model1, model2, winner in match_results:
        # Both participants played one more match.
        match_count_dict[model1] += 1
        match_count_dict[model2] += 1
        if winner == Winner.MODEL1:
            win_count_dict[model1] += 1
        elif winner == Winner.MODEL2:
            win_count_dict[model2] += 1
        elif winner == Winner.DRAW:
            # A draw is worth half a win to each side.
            win_count_dict[model1] += 0.5
            win_count_dict[model2] += 0.5

    win_rate_dict = {}
    for model in match_count_dict:
        win_rate_dict[model] = 100 * win_count_dict.get(model, 0.0) / match_count_dict[model]

    # Sort descending by win rate.
    return dict(sorted(win_rate_dict.items(), key=lambda x: -x[1]))