Skip to content

FewShotGenerator

FewShotGenerator

Source code in flexeval/core/few_shot_generator/base.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
class FewShotGenerator(ABC):
    """
    Base class for objects that pick the few-shot instances accompanying an evaluation instance.

    Subclasses implement `_sample_instances`; callers invoke the generator itself,
    which adds a data-leak check on top of the raw sampling.
    """

    def __init__(self, num_trials_to_avoid_leak: int) -> None:
        """
        Args:
            num_trials_to_avoid_leak: Maximum number of sampling attempts `__call__` makes
                while looking for a sample that does not contain the evaluation inputs.
                Zero or a negative value disables the leak check.
        """
        self._num_trials_to_avoid_leak = num_trials_to_avoid_leak

    @abstractmethod
    def _sample_instances(self, eval_inputs: list[dict[str, Any]] | dict[str, Any] | None = None) -> list[Instance]:
        """
        Sample instances for few-shot learning.
        This method should be implemented in the derived class.
        """
        raise NotImplementedError

    def __call__(self, eval_inputs: list[dict[str, Any]] | dict[str, Any] | None = None) -> list[Instance]:
        """
        Sample instances for few-shot learning.
        This method calls `_sample_instances` and
        checks if the sampled instances have the same inputs as the evaluation instance.

        Args:
            eval_inputs: The inputs of the evaluation instance.
                This is used to avoid data leakage
                by checking if the sampled instances have the same inputs as the evaluation instance.
                When it is `None`, no check is performed.

        Returns:
            A list of instances for few-shot learning.

        Raises:
            ValueError: If every one of the `num_trials_to_avoid_leak` samples contains
                an instance whose inputs equal `eval_inputs`.
        """
        max_trials = self._num_trials_to_avoid_leak

        # The leak check needs something to compare against and a positive trial budget.
        # A negative budget is treated as "disabled" rather than failing with a bogus leak error.
        if eval_inputs is None or not max_trials or max_trials <= 0:
            return self._sample_instances(eval_inputs=eval_inputs)

        # Sample inside the loop so that every drawn sample is checked and no extra
        # sample is drawn (and thrown away) after the last trial.
        for _ in range(max_trials):
            sampled_instances = self._sample_instances(eval_inputs=eval_inputs)
            if all(sampled.inputs != eval_inputs for sampled in sampled_instances):
                return sampled_instances

        msg = (
            f"Few-shot instance has the same inputs as the evaluation instance, "
            f"which indicates a data leak. "
            f"Failed to sample a different instance after {self._num_trials_to_avoid_leak} trials."
        )
        raise ValueError(msg)

__init__

__init__(num_trials_to_avoid_leak: int) -> None
Source code in flexeval/core/few_shot_generator/base.py
15
16
def __init__(self, num_trials_to_avoid_leak: int) -> None:
    """
    Remember the trial budget that `__call__` consults for its data-leak check.

    Args:
        num_trials_to_avoid_leak: Number of sampling trials allowed when avoiding a leak.
    """
    self._num_trials_to_avoid_leak = num_trials_to_avoid_leak

__call__

__call__(
    eval_inputs: list[dict[str, Any]]
    | dict[str, Any]
    | None = None,
) -> list[Instance]

Sample instances for few-shot learning. This method calls _sample_instances and checks if the sampled instances have the same inputs as the evaluation instance.

Parameters:

  • eval_inputs (list[dict[str, Any]] | dict[str, Any] | None, default: None ) –

    The inputs of the evaluation instance. This is used to avoid data leakage by checking if the sampled instances have the same inputs as the evaluation instance.

Returns:

  • list[Instance]

    A list of instances for few-shot learning.

Source code in flexeval/core/few_shot_generator/base.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def __call__(self, eval_inputs: list[dict[str, Any]] | dict[str, Any] | None = None) -> list[Instance]:
    """
    Sample instances for few-shot learning.
    This method calls `_sample_instances` and
    checks if the sampled instances have the same inputs as the evaluation instance.

    Args:
        eval_inputs: The inputs of the evaluation instance.
            This is used to avoid data leakage
            by checking if the sampled instances have the same inputs as the evaluation instance.
            When it is `None`, no check is performed.

    Returns:
        A list of instances for few-shot learning.

    Raises:
        ValueError: If every one of the `num_trials_to_avoid_leak` samples contains
            an instance whose inputs equal `eval_inputs`.
    """
    max_trials = self._num_trials_to_avoid_leak

    # The leak check needs something to compare against and a positive trial budget.
    # A negative budget is treated as "disabled" rather than failing with a bogus leak error.
    if eval_inputs is None or not max_trials or max_trials <= 0:
        return self._sample_instances(eval_inputs=eval_inputs)

    # Sample inside the loop so that every drawn sample is checked and no extra
    # sample is drawn (and thrown away) after the last trial.
    for _ in range(max_trials):
        sampled_instances = self._sample_instances(eval_inputs=eval_inputs)
        if all(sampled.inputs != eval_inputs for sampled in sampled_instances):
            return sampled_instances

    msg = (
        f"Few-shot instance has the same inputs as the evaluation instance, "
        f"which indicates a data leak. "
        f"Failed to sample a different instance after {self._num_trials_to_avoid_leak} trials."
    )
    raise ValueError(msg)

BalancedFewShotGenerator

Source code in flexeval/core/few_shot_generator/balanced.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class BalancedFewShotGenerator(FewShotGenerator):
    """
    Few-shot generator that spreads the shots as evenly as possible across labels.

    The label of an instance is taken to be the first element of its `references`.
    """

    def __init__(
        self,
        dataset: GenerationDataset,
        num_shots: int,
        seed: int = 42,
        num_trials_to_avoid_leak: int = 3,
    ) -> None:
        """
        Args:
            dataset: The dataset the few-shot instances are drawn from.
            num_shots: The number of instances returned per call.
            seed: Seed of the private random number generator.
            num_trials_to_avoid_leak: Passed to the base class for its leak check.

        Raises:
            TypeError: If `dataset` is not a `GenerationDataset`.
            ValueError: If `num_shots` exceeds the number of instances in `dataset`.
        """
        super().__init__(num_trials_to_avoid_leak=num_trials_to_avoid_leak)
        if not isinstance(dataset, GenerationDataset):
            msg = "BalancedFewShotGenerator only supports GenerationDataset"
            raise TypeError(msg)

        if num_shots > len(dataset):
            msg = (
                f"`num_shots` should be less than or equal to the number of instances in `dataset`. "
                f"num_shots: {num_shots}, len(dataset): {len(dataset)}"
            )
            raise ValueError(msg)

        self.dataset = dataset
        self.num_shots = num_shots
        self._rnd = random.Random(seed)

        # Separate instances by label
        # Here we assume that the label is the first element of references of the instance.
        label_to_ids: dict[str, list[int]] = defaultdict(list)
        for i, instance in enumerate(dataset):
            label_to_ids[instance.references[0]].append(i)
        self._label_to_ids = label_to_ids

    def _sample_instances(
        self,
        eval_inputs: list[dict[str, Any]] | dict[str, Any] | None = None,
    ) -> list[GenerationInstance]:
        """
        Draw `num_shots` instances, spreading them as evenly as possible over the labels.

        A label that has fewer instances than its even share contributes all it has,
        and the shortfall is handed to labels that still have unused instances.

        Args:
            eval_inputs: Accepted for interface compatibility; not used by this sampler.

        Returns:
            The sampled instances in shuffled order.

        Raises:
            ValueError: If `num_shots` cannot be satisfied from the dataset.
        """
        labels = list(self._label_to_ids.keys())
        if not labels:
            # No labels means an empty dataset (`__init__` rejects `num_shots` above the
            # dataset size); return early because the division below would fail.
            return []

        # Shuffle labels
        self._rnd.shuffle(labels)

        # Evenly distribute num_samples to each label
        num_samples_list = [self.num_shots // len(labels)] * len(labels)
        remaining_samples = self.num_shots % len(labels)
        for i in range(remaining_samples):
            num_samples_list[i] += 1

        # Never ask a label for more instances than it owns: `random.sample` raises
        # ValueError in that case, which used to break imbalanced datasets.
        capacities = [len(self._label_to_ids[label]) for label in labels]
        num_samples_list = [min(wanted, capacity) for wanted, capacity in zip(num_samples_list, capacities)]

        # Hand the shortfall, one instance per pass, to labels that still have spare instances.
        # When every label already covers its share this loop does not run,
        # so the allocation (and the random stream) is unchanged.
        shortfall = self.num_shots - sum(num_samples_list)
        while shortfall > 0:
            handed_out = False
            for i, capacity in enumerate(capacities):
                if shortfall == 0:
                    break
                if num_samples_list[i] < capacity:
                    num_samples_list[i] += 1
                    shortfall -= 1
                    handed_out = True
            if not handed_out:
                msg = (
                    f"Cannot sample {self.num_shots} instances "
                    f"from a dataset with {sum(capacities)} instances."
                )
                raise ValueError(msg)

        # Sample instances from each label
        sampled_indices: list[int] = []
        for label, num_samples_for_the_label in zip(labels, num_samples_list):
            sampled_indices += self._rnd.sample(
                self._label_to_ids[label],
                num_samples_for_the_label,
            )
        self._rnd.shuffle(sampled_indices)

        return [self.dataset[i] for i in sampled_indices]

    def __repr__(self) -> str:
        """
        Return a `ClassName(dataset=..., num_shots=...)` summary of this generator.
        """
        return f"{self.__class__.__name__}(dataset={self.dataset}, num_shots={self.num_shots})"

dataset instance-attribute

dataset = dataset

num_shots instance-attribute

num_shots = num_shots

__init__

__init__(
    dataset: GenerationDataset,
    num_shots: int,
    seed: int = 42,
    num_trials_to_avoid_leak: int = 3,
) -> None
Source code in flexeval/core/few_shot_generator/balanced.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def __init__(
    self,
    dataset: GenerationDataset,
    num_shots: int,
    seed: int = 42,
    num_trials_to_avoid_leak: int = 3,
) -> None:
    """
    Validate the arguments, seed the private random generator and index the dataset by label.

    Args:
        dataset: The dataset the few-shot instances are drawn from.
        num_shots: The number of instances returned per call.
        seed: Seed of the private random number generator.
        num_trials_to_avoid_leak: Passed to the base class initializer.

    Raises:
        TypeError: If `dataset` is not a `GenerationDataset`.
        ValueError: If `num_shots` exceeds the number of instances in `dataset`.
    """
    super().__init__(num_trials_to_avoid_leak=num_trials_to_avoid_leak)

    if not isinstance(dataset, GenerationDataset):
        msg = "BalancedFewShotGenerator only supports GenerationDataset"
        raise TypeError(msg)

    dataset_size = len(dataset)
    if num_shots > dataset_size:
        msg = (
            f"`num_shots` should be less than or equal to the number of instances in `dataset`. "
            f"num_shots: {num_shots}, len(dataset): {dataset_size}"
        )
        raise ValueError(msg)

    self.dataset = dataset
    self.num_shots = num_shots
    self._rnd = random.Random(seed)

    # Group dataset positions by label; the label is assumed to be
    # the first entry of each instance's `references`.
    positions_by_label: dict[str, list[int]] = defaultdict(list)
    for position, example in enumerate(dataset):
        label = example.references[0]
        positions_by_label[label].append(position)
    self._label_to_ids = positions_by_label

__repr__

__repr__() -> str
Source code in flexeval/core/few_shot_generator/balanced.py
66
67
def __repr__(self) -> str:
    """
    Return a `ClassName(dataset=..., num_shots=...)` summary of this generator.
    """
    class_name = self.__class__.__name__
    return f"{class_name}(dataset={self.dataset}, num_shots={self.num_shots})"

FixedFewShotGenerator

Source code in flexeval/core/few_shot_generator/fixed.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
class FixedFewShotGenerator(FewShotGenerator):
    """
    Few-shot generator that always returns the same, explicitly configured instances.
    """

    def __init__(self, instance_class: str, instance_params: list[dict[str, Any]]) -> None:
        """
        Args:
            instance_class: Name of the instance type to build; one of
                "GenerationInstance", "MultipleChoiceInstance" or "ChatInstance".
            instance_params: One dict of keyword arguments per instance to build.

        Raises:
            ValueError: If `instance_class` is not one of the supported names.
        """
        # Zero trials turns the base-class leak check off.
        super().__init__(num_trials_to_avoid_leak=0)

        constructors = {
            "GenerationInstance": GenerationInstance,
            "MultipleChoiceInstance": MultipleChoiceInstance,
            "ChatInstance": ChatInstance,
        }
        instance_init = constructors.get(instance_class)
        if instance_init is None:
            msg = f"Unknown instance class: {instance_class}"
            raise ValueError(msg)

        self.instances = [instance_init(**kwargs) for kwargs in instance_params]

    def _sample_instances(self, eval_inputs: list[dict[str, Any]] | dict[str, Any] | None = None) -> list[Instance]:
        """
        Return the configured instances; `eval_inputs` is ignored.
        """
        return self.instances

    def __repr__(self) -> str:
        """
        Return a `ClassName(instances=...)` summary of this generator.
        """
        class_name = self.__class__.__name__
        return f"{class_name}(instances={self.instances})"

instances instance-attribute

instances = [
    instance_init(**params) for params in instance_params
]

__init__

__init__(
    instance_class: str,
    instance_params: list[dict[str, Any]],
) -> None
Source code in flexeval/core/few_shot_generator/fixed.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def __init__(self, instance_class: str, instance_params: list[dict[str, Any]]) -> None:
    """
    Build the fixed list of few-shot instances from their constructor arguments.

    Args:
        instance_class: Name of the instance type to build; one of
            "GenerationInstance", "MultipleChoiceInstance" or "ChatInstance".
        instance_params: One dict of keyword arguments per instance to build.

    Raises:
        ValueError: If `instance_class` is not one of the supported names.
    """
    # Zero trials is passed to the base class initializer.
    super().__init__(num_trials_to_avoid_leak=0)

    constructors = {
        "GenerationInstance": GenerationInstance,
        "MultipleChoiceInstance": MultipleChoiceInstance,
        "ChatInstance": ChatInstance,
    }
    instance_init = constructors.get(instance_class)
    if instance_init is None:
        msg = f"Unknown instance class: {instance_class}"
        raise ValueError(msg)

    self.instances = [instance_init(**kwargs) for kwargs in instance_params]

__repr__

__repr__() -> str
Source code in flexeval/core/few_shot_generator/fixed.py
27
28
def __repr__(self) -> str:
    """
    Return a `ClassName(instances=...)` summary of this generator.
    """
    class_name = self.__class__.__name__
    return f"{class_name}(instances={self.instances})"

RandomFewShotGenerator

Source code in flexeval/core/few_shot_generator/rand.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class RandomFewShotGenerator(FewShotGenerator):
    """
    Few-shot generator that draws `num_shots` distinct instances at random from a dataset.
    """

    def __init__(
        self,
        dataset: Dataset,
        num_shots: int,
        seed: int = 42,
        num_trials_to_avoid_leak: int = 3,
    ) -> None:
        """
        Args:
            dataset: The dataset the few-shot instances are drawn from.
            num_shots: The number of instances returned per call.
            seed: Seed of the private random number generator.
            num_trials_to_avoid_leak: Passed to the base class for its leak check.

        Raises:
            ValueError: If `num_shots` exceeds the number of instances in `dataset`.
        """
        super().__init__(num_trials_to_avoid_leak=num_trials_to_avoid_leak)

        dataset_size = len(dataset)
        if num_shots > dataset_size:
            msg = (
                f"`num_shots` should be less than or equal to the number of instances in `dataset`. "
                f"num_shots: {num_shots}, len(dataset): {dataset_size}"
            )
            raise ValueError(msg)

        self.dataset = dataset
        self.num_shots = num_shots
        self._rnd = random.Random(seed)

    def _sample_instances(self, eval_inputs: list[dict[str, Any]] | dict[str, Any] | None = None) -> list[Instance]:
        """
        Pick `num_shots` distinct dataset positions at random and return their instances.

        `eval_inputs` is not used by this sampler.
        """
        candidate_positions = range(len(self.dataset))
        chosen = self._rnd.sample(candidate_positions, self.num_shots)
        return [self.dataset[position] for position in chosen]

    def __repr__(self) -> str:
        """
        Return a `ClassName(dataset=..., num_shots=...)` summary of this generator.
        """
        class_name = self.__class__.__name__
        return f"{class_name}(dataset={self.dataset}, num_shots={self.num_shots})"

dataset instance-attribute

dataset = dataset

num_shots instance-attribute

num_shots = num_shots

__init__

__init__(
    dataset: Dataset,
    num_shots: int,
    seed: int = 42,
    num_trials_to_avoid_leak: int = 3,
) -> None
Source code in flexeval/core/few_shot_generator/rand.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def __init__(
    self,
    dataset: Dataset,
    num_shots: int,
    seed: int = 42,
    num_trials_to_avoid_leak: int = 3,
) -> None:
    """
    Validate `num_shots` against the dataset size and seed the private random generator.

    Args:
        dataset: The dataset the few-shot instances are drawn from.
        num_shots: The number of instances returned per call.
        seed: Seed of the private random number generator.
        num_trials_to_avoid_leak: Passed to the base class initializer.

    Raises:
        ValueError: If `num_shots` exceeds the number of instances in `dataset`.
    """
    super().__init__(num_trials_to_avoid_leak=num_trials_to_avoid_leak)

    dataset_size = len(dataset)
    if num_shots > dataset_size:
        msg = (
            f"`num_shots` should be less than or equal to the number of instances in `dataset`. "
            f"num_shots: {num_shots}, len(dataset): {dataset_size}"
        )
        raise ValueError(msg)

    self.dataset = dataset
    self.num_shots = num_shots
    self._rnd = random.Random(seed)

__repr__

__repr__() -> str
Source code in flexeval/core/few_shot_generator/rand.py
34
35
def __repr__(self) -> str:
    """
    Return a `ClassName(dataset=..., num_shots=...)` summary of this generator.
    """
    class_name = self.__class__.__name__
    return f"{class_name}(dataset={self.dataset}, num_shots={self.num_shots})"