ResultRecorder

An abstract base class for recording experiment results, including configuration, metrics, and model outputs.

This class defines the interface for different result recording implementations, such as saving to a local directory, uploading to wandb, or integrating with MLflow.

Source code in flexeval/core/result_recorder/base.py
class ResultRecorder(ABC):
    """
    An abstract base class for recording experiment results, including configuration,
    metrics, and model outputs.

    This class defines the interface for different result recording implementations,
    such as saving to a local directory, uploading to wandb, or integrating with MLflow.
    """

    @abstractmethod
    def record_config(self, config: dict[str, Any], group: str | None = None) -> None:
        """
        Record the configuration parameters of the experiment.

        Args:
            config: A dictionary containing the configuration
                parameters of the evaluation.
            group: An optional group name to organize the configuration.
        """

    @abstractmethod
    def record_metrics(self, metrics: dict[str, Any], group: str | None = None) -> None:
        """
        Record the evaluation metrics of the experiment.

        Args:
            metrics: A dictionary containing the evaluation metrics,
                where keys are metric names and values are the corresponding results.
            group: An optional group name to organize the metrics.
        """

    @abstractmethod
    def record_model_outputs(self, model_outputs: list[dict[str, Any]], group: str | None = None) -> None:
        """
        Record the outputs generated by the model during evaluation.

        Args:
            model_outputs: A list of dictionaries, where each
                dictionary represents a single model output. The structure of these
                dictionaries may vary depending on the specific model and task.
            group: An optional group name to organize the model outputs.
        """

record_config abstractmethod

record_config(
    config: dict[str, Any], group: str | None = None
) -> None

Record the configuration parameters of the experiment.

Parameters:

  • config (dict[str, Any]) – A dictionary containing the configuration parameters of the evaluation.
  • group (str | None, default: None) – An optional group name to organize the configuration.

Source code in flexeval/core/result_recorder/base.py
@abstractmethod
def record_config(self, config: dict[str, Any], group: str | None = None) -> None:
    """
    Record the configuration parameters of the experiment.

    Args:
        config: A dictionary containing the configuration
            parameters of the evaluation.
        group: An optional group name to organize the configuration.
    """

record_metrics abstractmethod

record_metrics(
    metrics: dict[str, Any], group: str | None = None
) -> None

Record the evaluation metrics of the experiment.

Parameters:

  • metrics (dict[str, Any]) – A dictionary containing the evaluation metrics, where keys are metric names and values are the corresponding results.
  • group (str | None, default: None) – An optional group name to organize the metrics.

Source code in flexeval/core/result_recorder/base.py
@abstractmethod
def record_metrics(self, metrics: dict[str, Any], group: str | None = None) -> None:
    """
    Record the evaluation metrics of the experiment.

    Args:
        metrics: A dictionary containing the evaluation metrics,
            where keys are metric names and values are the corresponding results.
        group: An optional group name to organize the metrics.
    """

record_model_outputs abstractmethod

record_model_outputs(
    model_outputs: list[dict[str, Any]],
    group: str | None = None,
) -> None

Record the outputs generated by the model during evaluation.

Parameters:

  • model_outputs (list[dict[str, Any]]) – A list of dictionaries, where each dictionary represents a single model output. The structure of these dictionaries may vary depending on the specific model and task.
  • group (str | None, default: None) – An optional group name to organize the model outputs.

Source code in flexeval/core/result_recorder/base.py
@abstractmethod
def record_model_outputs(self, model_outputs: list[dict[str, Any]], group: str | None = None) -> None:
    """
    Record the outputs generated by the model during evaluation.

    Args:
        model_outputs: A list of dictionaries, where each
            dictionary represents a single model output. The structure of these
            dictionaries may vary depending on the specific model and task.
        group: An optional group name to organize the model outputs.
    """

LocalRecorder

A class to record the results as JSON files in a local directory.

Parameters:

  • output_dir (str) – The directory to save the results.
  • force (bool, default: False) – If True, existing result files are overwritten instead of raising a FileExistsError.

Source code in flexeval/core/result_recorder/local_recorder.py
class LocalRecorder(ResultRecorder):
    """
    A class to record the results as JSON files in a local directory.

    Args:
        output_dir: The directory to save the results.
        force: If `True`, overwrite existing result files instead of raising a `FileExistsError`.
    """

    def __init__(self, output_dir: str, force: bool = False) -> None:
        self.output_dir = Path(output_dir)
        self.force = force

    @staticmethod
    def _check_output_dir_exists(output_dir: str | PathLike[str], checked_files: list[str]) -> None:
        output_dir = Path(output_dir)
        for file_name in checked_files:
            if (output_dir / file_name).exists():
                msg = (
                    f"`{output_dir / file_name}` already exists. If you want to overwrite it, "
                    f"please specify `--force true` from CLI or `force=True` when initializing the recorder."
                )
                raise FileExistsError(msg)

    def record_config(self, config: dict[str, Any], group: str | None = None) -> None:
        output_dir = self.output_dir
        if group is not None:
            output_dir = self.output_dir / group

        if not self.force:
            self._check_output_dir_exists(output_dir, [CONFIG_FILE_NAME])

        save_json(config, output_dir / CONFIG_FILE_NAME)
        logger.info(f"Saved the config to {output_dir / CONFIG_FILE_NAME}")

    def record_metrics(self, metrics: dict[str, Any], group: str | None = None) -> None:
        output_dir = self.output_dir
        if group is not None:
            output_dir = self.output_dir / group

        if not self.force:
            self._check_output_dir_exists(output_dir, [METRIC_FILE_NAME])

        save_json(metrics, output_dir / METRIC_FILE_NAME)
        logger.info(f"Saved the metrics to {output_dir / METRIC_FILE_NAME}")

    def record_model_outputs(self, model_outputs: list[dict[str, Any]], group: str | None = None) -> None:
        output_dir = self.output_dir
        if group is not None:
            output_dir = output_dir / group

        if not self.force:
            self._check_output_dir_exists(output_dir, [OUTPUTS_FILE_NAME])

        save_jsonl(model_outputs, output_dir / OUTPUTS_FILE_NAME)
        logger.info(f"Saved the outputs to {output_dir / OUTPUTS_FILE_NAME}")
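
A minimal usage sketch (the directory name, config values, and output record below are placeholders; the import path follows the source location shown above):

from flexeval.core.result_recorder.local_recorder import LocalRecorder

recorder = LocalRecorder("results/run1", force=True)
recorder.record_config({"model": "my-model", "max_tokens": 128})
recorder.record_metrics({"accuracy": 0.87}, group="test")
recorder.record_model_outputs(
    [{"input": "What is 2 + 2?", "output": "4"}],
    group="test",
)

With group="test", the metrics and outputs are written under results/run1/test/, while the ungrouped config goes directly into results/run1/. With the default force=False, a second run against the same directory raises FileExistsError instead of silently overwriting existing files.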

output_dir instance-attribute

output_dir = Path(output_dir)

force instance-attribute

force = force

__init__

__init__(output_dir: str, force: bool = False) -> None
Source code in flexeval/core/result_recorder/local_recorder.py
def __init__(self, output_dir: str, force: bool = False) -> None:
    self.output_dir = Path(output_dir)
    self.force = force

record_config

record_config(
    config: dict[str, Any], group: str | None = None
) -> None
Source code in flexeval/core/result_recorder/local_recorder.py
def record_config(self, config: dict[str, Any], group: str | None = None) -> None:
    output_dir = self.output_dir
    if group is not None:
        output_dir = self.output_dir / group

    if not self.force:
        self._check_output_dir_exists(output_dir, [CONFIG_FILE_NAME])

    save_json(config, output_dir / CONFIG_FILE_NAME)
    logger.info(f"Saved the config to {output_dir / CONFIG_FILE_NAME}")

record_metrics

record_metrics(
    metrics: dict[str, Any], group: str | None = None
) -> None
Source code in flexeval/core/result_recorder/local_recorder.py
def record_metrics(self, metrics: dict[str, Any], group: str | None = None) -> None:
    output_dir = self.output_dir
    if group is not None:
        output_dir = self.output_dir / group

    if not self.force:
        self._check_output_dir_exists(output_dir, [METRIC_FILE_NAME])

    save_json(metrics, output_dir / METRIC_FILE_NAME)
    logger.info(f"Saved the metrics to {output_dir / METRIC_FILE_NAME}")

record_model_outputs

record_model_outputs(
    model_outputs: list[dict[str, Any]],
    group: str | None = None,
) -> None
Source code in flexeval/core/result_recorder/local_recorder.py
def record_model_outputs(self, model_outputs: list[dict[str, Any]], group: str | None = None) -> None:
    output_dir = self.output_dir
    if group is not None:
        output_dir = output_dir / group

    if not self.force:
        self._check_output_dir_exists(output_dir, [OUTPUTS_FILE_NAME])

    save_jsonl(model_outputs, output_dir / OUTPUTS_FILE_NAME)
    logger.info(f"Saved the outputs to {output_dir / OUTPUTS_FILE_NAME}")

WandBRecorder

A class to record the results to Weights & Biases.

Parameters:

  • init_kwargs (dict[str, Any] | None, default: None) – The arguments for the wandb.init function. Please refer to the official documentation (https://docs.wandb.ai/ref/python/init) for the details.

Source code in flexeval/core/result_recorder/wandb_recorder.py
class WandBRecorder(ResultRecorder):
    """
    A class to record the results to Weights & Biases.

    Args:
        init_kwargs: The arguments for the `wandb.init` function.
            Please refer to [the official documentation](https://docs.wandb.ai/ref/python/init) for the details.
    """

    def __init__(
        self,
        init_kwargs: dict[str, Any] | None = None,
    ) -> None:
        import wandb

        self._wandb = wandb
        init_kwargs = init_kwargs or {}
        self._wandb.init(**init_kwargs)

    def record_config(self, config: dict[str, Any], group: str | None = None) -> None:
        if group:
            self._wandb.config.update({group: config})
        else:
            self._wandb.config.update(config)

    def record_metrics(self, metrics: dict[str, Any], group: str | None = None) -> None:
        if group:
            self._wandb.summary.update({group: metrics})
        else:
            self._wandb.summary.update(metrics)

    def record_model_outputs(self, model_outputs: list[dict[str, Any]], group: str | None = None) -> None:
        table = self._wandb.Table(columns=list(model_outputs[0].keys()))

        for output in model_outputs:
            table.add_data(*output.values())

        table_name = "model_outputs" if group is None else f"{group}/model_outputs"
        self._wandb.log({table_name: table})

    def __del__(self) -> None:
        self._wandb.finish()
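
A minimal usage sketch (the project name and recorded values are placeholders; init_kwargs is forwarded unchanged to wandb.init, so any of its arguments can be used):

from flexeval.core.result_recorder.wandb_recorder import WandBRecorder

recorder = WandBRecorder(init_kwargs={"project": "my-eval-project"})
recorder.record_config({"model": "my-model"})
recorder.record_metrics({"accuracy": 0.87}, group="test")
recorder.record_model_outputs([{"input": "What is 2 + 2?", "output": "4"}])

When a group is given, the config and metrics are nested under that key in wandb.config and wandb.summary, and the model outputs are logged as a wandb.Table named "{group}/model_outputs". The run is finished automatically via wandb.finish() when the recorder is garbage-collected.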

__init__

__init__(init_kwargs: dict[str, Any] | None = None) -> None
Source code in flexeval/core/result_recorder/wandb_recorder.py
def __init__(
    self,
    init_kwargs: dict[str, Any] | None = None,
) -> None:
    import wandb

    self._wandb = wandb
    init_kwargs = init_kwargs or {}
    self._wandb.init(**init_kwargs)

record_config

record_config(
    config: dict[str, Any], group: str | None = None
) -> None
Source code in flexeval/core/result_recorder/wandb_recorder.py
def record_config(self, config: dict[str, Any], group: str | None = None) -> None:
    if group:
        self._wandb.config.update({group: config})
    else:
        self._wandb.config.update(config)

record_metrics

record_metrics(
    metrics: dict[str, Any], group: str | None = None
) -> None
Source code in flexeval/core/result_recorder/wandb_recorder.py
def record_metrics(self, metrics: dict[str, Any], group: str | None = None) -> None:
    if group:
        self._wandb.summary.update({group: metrics})
    else:
        self._wandb.summary.update(metrics)

record_model_outputs

record_model_outputs(
    model_outputs: list[dict[str, Any]],
    group: str | None = None,
) -> None
Source code in flexeval/core/result_recorder/wandb_recorder.py
def record_model_outputs(self, model_outputs: list[dict[str, Any]], group: str | None = None) -> None:
    table = self._wandb.Table(columns=list(model_outputs[0].keys()))

    for output in model_outputs:
        table.add_data(*output.values())

    table_name = "model_outputs" if group is None else f"{group}/model_outputs"
    self._wandb.log({table_name: table})

__del__

__del__() -> None
Source code in flexeval/core/result_recorder/wandb_recorder.py
def __del__(self) -> None:
    self._wandb.finish()