Skip to content

Samplers

glide.samplers.uniform.UniformSampler

Sampler that draws observations uniformly without replacement from the pool.

It is the standard approach when no auxiliary signal is available.

Examples:

>>> from glide.samplers import UniformSampler
>>> sampler = UniformSampler()
>>> xi = sampler.sample(n_samples=2, budget=1, random_seed=0)
>>> xi
array([0., 1.])
Source code in glide/samplers/uniform.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
class UniformSampler:
    """Uniform, without-replacement sampler over a pool of observations.

    Every observation in the pool is equally likely to be picked, which makes
    this the natural baseline when no auxiliary signal is available to guide
    the selection.

    Examples
    --------
    >>> from glide.samplers import UniformSampler
    >>> sampler = UniformSampler()
    >>> xi = sampler.sample(n_samples=2, budget=1, random_seed=0)
    >>> xi
    array([0., 1.])
    """

    def sample(
        self,
        n_samples: int,
        budget: int,
        random_seed: Optional[Union[int, SeedSequence]] = None,
    ) -> NDArray:
        """Pick ``budget`` observations out of ``n_samples`` uniformly at random.

        The draw is made without replacement, so exactly ``budget`` distinct
        positions are flagged in the returned indicator array.

        Parameters
        ----------
        n_samples : int
            Size of the pool to draw from. Must be a strictly positive
            integer.
        budget : int
            Number of observations to select. Must be a strictly positive
            integer no larger than ``n_samples``.
        random_seed : int or SeedSequence or None, optional
            Seed forwarded to ``numpy.random.default_rng``. With ``None``
            (the default) the generator is seeded non-deterministically.

        Returns
        -------
        NDArray
            Float array of shape ``(n_samples,)`` holding ``1.0`` at the
            selected positions and ``0.0`` everywhere else.

        Raises
        ------
        ValueError
            If ``n_samples`` or ``budget`` is not a strictly positive integer,
            or if ``budget`` exceeds ``n_samples``.
        """
        # Same checks, same order: n_samples first, then budget.
        for value, label in ((n_samples, "n_samples"), (budget, "budget")):
            _validate_is_integer(value, label)
            _validate_strictly_positive(value, label)
        if budget > n_samples:
            raise ValueError(f"'budget' must not exceed 'n_samples'; got budget={budget} but n_samples={n_samples}.")

        generator = np.random.default_rng(random_seed)
        chosen = generator.choice(n_samples, size=budget, replace=False)

        # Start from "nothing selected" and switch on the drawn positions.
        indicators = np.zeros(n_samples)
        indicators[chosen] = 1.0
        return indicators

sample

sample(n_samples, budget, random_seed=None)

Sample observations uniformly at random without replacement.

Selects exactly budget observations from a pool of n_samples without replacement.

Parameters:

Name Type Description Default
n_samples int

Total number of observations in the pool. Must be a strictly positive integer.

required
budget int

Exact number of observations to select. Must be a strictly positive integer and must not exceed n_samples.

required
random_seed int or SeedSequence or None

Random seed passed to numpy.random.default_rng for reproducibility. Pass None (the default) to use a non-deterministic seed.

None

Returns:

Type Description
NDArray

Array of shape (n_samples,) with selection indicators (1 if selected for annotation, 0 otherwise).

Raises:

Type Description
ValueError

If n_samples or budget is not a strictly positive integer, or if budget exceeds n_samples.

Source code in glide/samplers/uniform.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def sample(
    self,
    n_samples: int,
    budget: int,
    random_seed: Optional[Union[int, SeedSequence]] = None,
) -> NDArray:
    """Pick ``budget`` observations out of ``n_samples`` uniformly at random.

    The draw is made without replacement, so exactly ``budget`` distinct
    positions are flagged in the returned indicator array.

    Parameters
    ----------
    n_samples : int
        Size of the pool to draw from. Must be a strictly positive
        integer.
    budget : int
        Number of observations to select. Must be a strictly positive
        integer no larger than ``n_samples``.
    random_seed : int or SeedSequence or None, optional
        Seed forwarded to ``numpy.random.default_rng``. With ``None``
        (the default) the generator is seeded non-deterministically.

    Returns
    -------
    NDArray
        Float array of shape ``(n_samples,)`` holding ``1.0`` at the
        selected positions and ``0.0`` everywhere else.

    Raises
    ------
    ValueError
        If ``n_samples`` or ``budget`` is not a strictly positive integer,
        or if ``budget`` exceeds ``n_samples``.
    """
    # Same checks, same order: n_samples first, then budget.
    for value, label in ((n_samples, "n_samples"), (budget, "budget")):
        _validate_is_integer(value, label)
        _validate_strictly_positive(value, label)
    if budget > n_samples:
        raise ValueError(f"'budget' must not exceed 'n_samples'; got budget={budget} but n_samples={n_samples}.")

    generator = np.random.default_rng(random_seed)
    chosen = generator.choice(n_samples, size=budget, replace=False)

    # Start from "nothing selected" and switch on the drawn positions.
    indicators = np.zeros(n_samples)
    indicators[chosen] = 1.0
    return indicators

glide.samplers.active.ActiveSampler

Sampler that draws elements with probabilities based on uncertainty scores.

Implements active sampling for inference pipelines which support inverse probability weighting (IPW). Each observation is assigned a drawing probability π_i proportional to its uncertainty score, then independently selected via a Bernoulli trial. This concentrates the annotation budget on the most uncertain observations.

References

Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.

Gligorić, Kristina, Tijana Zrnic, Cinoo Lee, Emmanuel Candes, and Dan Jurafsky. "Can unconfident llm annotations be used for confident conclusions?." In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 3514-3533. 2025.

Examples:

>>> import numpy as np
>>> from glide.samplers import ActiveSampler
>>> uncertainties = np.array([0.1, 0.4])
>>> sampler = ActiveSampler()
>>> pi, xi = sampler.sample(uncertainties, budget=1, random_seed=0)
>>> pi
array([0.2, 0.8])
>>> xi
array([0., 1.])
Source code in glide/samplers/active.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
class ActiveSampler:
    """Uncertainty-driven sampler for inverse-probability-weighted pipelines.

    Every observation gets a drawing probability π_i that grows with its
    uncertainty score and is then selected (or not) through an independent
    Bernoulli trial. The annotation budget is therefore spent preferentially
    on the observations the model is least sure about, while the recorded
    probabilities allow downstream estimators to apply inverse probability
    weighting (IPW).

    References
    ----------
    Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings
    of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.

    Gligorić, Kristina, Tijana Zrnic, Cinoo Lee, Emmanuel Candes, and Dan Jurafsky.
    "Can unconfident llm annotations be used for confident conclusions?." In Proceedings
    of the 2025 Conference of the Nations of the Americas Chapter of the Association for
    Computational Linguistics: Human Language Technologies (Volume 1: Long Papers),
    pp. 3514-3533. 2025.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.samplers import ActiveSampler
    >>> uncertainties = np.array([0.1, 0.4])
    >>> sampler = ActiveSampler()
    >>> pi, xi = sampler.sample(uncertainties, budget=1, random_seed=0)
    >>> pi
    array([0.2, 0.8])
    >>> xi
    array([0., 1.])
    """

    def _compute_probabilities(self, uncertainties: NDArray, budget: int) -> NDArray:
        """Turn uncertainty scores into per-sample drawing probabilities.

        The proportional rule ``budget * u_i / sum(u)`` is used whenever it
        yields no probability above 1. Otherwise the probabilities are
        obtained by numerically minimising ``sum(u_i^2 / pi_i)`` subject to
        ``0 <= pi_i <= 1`` and ``sum(pi) == budget``, and the solver output
        is clipped from above at 1.

        A ``UserWarning`` is emitted when ``max(u) / min(u)`` exceeds 1e3.
        """
        spread = np.max(uncertainties) / np.min(uncertainties)
        if spread > 1e3:
            warnings.warn(
                f"Extreme uncertainty ratio detected among samples (max/min={spread:.2e} > 1e3); "
                "this may cause numerical instability.",
                UserWarning,
            )

        # Fast path: plain proportional probabilities are already valid.
        proportional_pi = budget * uncertainties / uncertainties.sum()
        if np.max(proportional_pi) <= 1.0:
            return proportional_pi

        size = len(uncertainties)
        u_squared = np.power(uncertainties, 2)

        def variance_proxy(pi: NDArray) -> float:
            return np.sum(u_squared / pi)

        def variance_proxy_gradient(pi: NDArray) -> NDArray:
            return -u_squared / np.power(pi, 2)

        # The proportional guess (which violates the upper bound) seeds the solver.
        solution = minimize(
            variance_proxy,
            proportional_pi,
            method="trust-constr",
            jac=variance_proxy_gradient,
            constraints=[LinearConstraint(np.ones((1, size)), lb=budget, ub=budget)],
            bounds=Bounds(lb=np.zeros(size), ub=np.ones(size)),
            options={"maxiter": 100},
        )
        # Guard against the solver overshooting the upper bound slightly.
        return np.minimum(solution.x, 1.0)

    def sample(
        self,
        uncertainties: NDArray,
        budget: int,
        random_seed: Optional[Union[int, SeedSequence]] = None,
    ) -> Tuple[NDArray, NDArray]:
        """Draw observations with probabilities driven by their uncertainty.

        The drawing probabilities π_i are chosen to minimise the variance of
        downstream IPW-based estimators, which amounts to minimising the sum
        of ``uncertainty_i^2 / π_i``. They lie in ``(0, 1]`` and add up to
        ``budget``. Selection is random, so the number of selected items
        varies, but it never exceeds ``budget``.

        Before drawing, the samples are randomly permuted; the inverse
        permutation is applied to the outputs so that both returned arrays
        follow the original input order. After drawing, a cutoff enforces the
        budget strictly: samples falling beyond the cutoff are discarded, and
        their entries in ``pi`` and ``xi`` are set to ``0.0`` and ``NaN``
        respectively.

        The returned pair is meant for IPW-based estimators: ``pi`` carries
        the per-sample selection probability and ``xi`` the selection
        indicator (1 = send for annotation, 0 = not selected, ``NaN`` =
        discarded by the budget cutoff).

        Parameters
        ----------
        uncertainties : NDArray
            Strictly positive uncertainty scores, shape ``(n_samples,)``.
        budget : int
            Expected total number of annotations to collect. Must be a
            strictly positive integer not exceeding ``len(uncertainties)``.
        random_seed : int or SeedSequence or None, optional
            Seed forwarded to ``numpy.random.default_rng``. With ``None``
            (the default) the generator is seeded non-deterministically.

        Returns
        -------
        Tuple[NDArray, NDArray]
            [0]: ``pi``, shape ``(n_samples,)``, the per-sample annotation probabilities
            for selected samples and ``0.0`` for unselected samples.
            [1]: ``xi``, shape ``(n_samples,)``, the Bernoulli indicators:
            ``1.0`` if selected for annotation, ``0.0`` if not selected,
            ``NaN`` if excluded by the budget cutoff.

        Raises
        ------
        ValueError
            If ``budget`` is not a strictly positive integer, if ``budget``
            exceeds ``len(uncertainties)``, or if any uncertainty value is NaN,
            zero, or negative.

        Warns
        -----
        UserWarning
            If the largest uncertainty is extreme relative to the smallest
            one, which may lead to numerical instability.

        References
        ----------
        Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings
        of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.
        """
        _validate_is_integer(budget, "budget")
        _validate_strictly_positive(budget, "budget")
        _validate_budget_bound(budget, len(uncertainties))
        _validate_uncertainties(uncertainties)

        probabilities = self._compute_probabilities(uncertainties, budget)
        generator = np.random.default_rng(random_seed)

        # Shuffle first, then run one Bernoulli trial per (shuffled) sample.
        shuffled_pi, permutation = _shuffle(probabilities, generator)
        draws = generator.binomial(n=1, p=shuffled_pi).astype(float)

        # The running count of selections feeds the budget cutoff helper.
        kept = _compute_cutoff_indices(np.cumsum(draws), permutation, budget)
        final_pi, final_xi = _build_output(kept, shuffled_pi, draws)
        return final_pi, final_xi

sample

sample(uncertainties, budget, random_seed=None)

Sample observations with probability proportional to uncertainty.

Each observation receives a drawing probability π_i that minimizes the variance of downstream IPW-based estimators. This is equivalently done by minimizing the sum of uncertainty_i^2 / π_i over all observations. Probabilities are constrained to (0, 1] and sum to budget. The actual number of selected items is random but limited to budget.

Samples are randomly permuted before drawing and the inverse permutation is applied to the output, so the returned arrays are always in the original input order. A post-draw cutoff is then applied to strictly respect the budget: samples beyond the cutoff are discarded by setting their entries in pi and xi to 0.0 and NaN respectively.

The two returned arrays are intended for use with IPW-based downstream estimators. pi holds the per-sample probability of being selected. xi holds the selection indicators for each sample so that a value of 1 means the sample should be sent for annotation, a value of 0 means it was not selected, and NaN means it was discarded by the budget cutoff.

Parameters:

Name Type Description Default
uncertainties NDArray

Array of shape (n_samples,) with strictly positive uncertainty scores.

required
budget int

Expected total number of annotations to collect. Must be a strictly positive integer and must not exceed len(uncertainties).

required
random_seed int or SeedSequence or None

Random seed passed to numpy.random.default_rng for reproducibility. Pass None (the default) to use a non-deterministic seed.

None

Returns:

Type Description
Tuple[NDArray, NDArray]

[0]: array of shape (n_samples,), pi with per-sample annotation probabilities for selected samples and 0.0 for unselected samples. [1]: array of shape (n_samples,), xi with Bernoulli indicators: 1.0 if selected for annotation, 0.0 if not selected, NaN if excluded by the budget cutoff.

Raises:

Type Description
ValueError

If budget is not a strictly positive integer, if budget exceeds len(uncertainties), or if any uncertainty value is NaN, zero, or negative.

Warns:

Type Description
UserWarning

If the ratio of the largest to the smallest uncertainty is extreme, indicating potential numerical instability.

References

Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.

Source code in glide/samplers/active.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def sample(
    self,
    uncertainties: NDArray,
    budget: int,
    random_seed: Optional[Union[int, SeedSequence]] = None,
) -> Tuple[NDArray, NDArray]:
    """Draw observations with probabilities driven by their uncertainty.

    The drawing probabilities π_i are chosen to minimise the variance of
    downstream IPW-based estimators, which amounts to minimising the sum
    of ``uncertainty_i^2 / π_i``. They lie in ``(0, 1]`` and add up to
    ``budget``. Selection is random, so the number of selected items
    varies, but it never exceeds ``budget``.

    Before drawing, the samples are randomly permuted; the inverse
    permutation is applied to the outputs so that both returned arrays
    follow the original input order. After drawing, a cutoff enforces the
    budget strictly: samples falling beyond the cutoff are discarded, and
    their entries in ``pi`` and ``xi`` are set to ``0.0`` and ``NaN``
    respectively.

    The returned pair is meant for IPW-based estimators: ``pi`` carries
    the per-sample selection probability and ``xi`` the selection
    indicator (1 = send for annotation, 0 = not selected, ``NaN`` =
    discarded by the budget cutoff).

    Parameters
    ----------
    uncertainties : NDArray
        Strictly positive uncertainty scores, shape ``(n_samples,)``.
    budget : int
        Expected total number of annotations to collect. Must be a
        strictly positive integer not exceeding ``len(uncertainties)``.
    random_seed : int or SeedSequence or None, optional
        Seed forwarded to ``numpy.random.default_rng``. With ``None``
        (the default) the generator is seeded non-deterministically.

    Returns
    -------
    Tuple[NDArray, NDArray]
        [0]: ``pi``, shape ``(n_samples,)``, the per-sample annotation probabilities
        for selected samples and ``0.0`` for unselected samples.
        [1]: ``xi``, shape ``(n_samples,)``, the Bernoulli indicators:
        ``1.0`` if selected for annotation, ``0.0`` if not selected,
        ``NaN`` if excluded by the budget cutoff.

    Raises
    ------
    ValueError
        If ``budget`` is not a strictly positive integer, if ``budget``
        exceeds ``len(uncertainties)``, or if any uncertainty value is NaN,
        zero, or negative.

    Warns
    -----
    UserWarning
        If the largest uncertainty is extreme relative to the smallest
        one, which may lead to numerical instability.

    References
    ----------
    Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings
    of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.
    """
    _validate_is_integer(budget, "budget")
    _validate_strictly_positive(budget, "budget")
    _validate_budget_bound(budget, len(uncertainties))
    _validate_uncertainties(uncertainties)

    probabilities = self._compute_probabilities(uncertainties, budget)
    generator = np.random.default_rng(random_seed)

    # Shuffle first, then run one Bernoulli trial per (shuffled) sample.
    shuffled_pi, permutation = _shuffle(probabilities, generator)
    draws = generator.binomial(n=1, p=shuffled_pi).astype(float)

    # The running count of selections feeds the budget cutoff helper.
    kept = _compute_cutoff_indices(np.cumsum(draws), permutation, budget)
    final_pi, final_xi = _build_output(kept, shuffled_pi, draws)
    return final_pi, final_xi

glide.samplers.stratified.StratifiedSampler

Sampler for per-stratum annotation budget allocation.

This class implements stratified sampling strategies that determine how many samples to annotate in each stratum, given a fixed annotation budget and proxy labels for all samples (provided as numpy arrays). It supports two allocation strategies:

  • Proportional allocation (baseline): Allocates budget proportionally to stratum sizes, resulting in uniform sampling probabilities across the dataset.

  • Neyman allocation (default, optimal): Assigns more budget to strata with higher proxy variance, minimising the asymptotic variance of downstream estimators. Particularly effective when proxy variance varies substantially across strata.

Both allocators use largest-remainder rounding (Hamilton's method) to allocate budget across strata. Per-stratum sample sizes are capped at stratum size, so total allocated budget Σ n_h ≤ budget (may be less if strata are small). The sampler is typically used upstream of statistical estimators to plan annotation effort.

References

Fogliato, Riccardo, Pratik Patil, Mathew Monfort, and Pietro Perona. "A framework for efficient model evaluation through stratification, sampling, and estimation." In European Conference on Computer Vision, pp. 140-158. Cham: Springer Nature Switzerland, 2024.

Examples:

>>> import numpy as np
>>> from glide.samplers import StratifiedSampler
>>> y_proxy = np.array([0.8, 0.9, 0.85, 0.88, 2.4 , 2.5 , 2.45, 2.48])
>>> groups = np.array(["A", "A", "A", "A", "B", "B", "B", "B"], dtype=object)
>>> sampler = StratifiedSampler()
>>> xi = sampler.sample(y_proxy, groups, budget=4, random_seed=1)
>>> xi
array([0, 1, 1, 0, 1, 0, 1, 0])
Source code in glide/samplers/stratified.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
class StratifiedSampler:
    """Sampler for per-stratum annotation budget allocation.

    This class implements stratified sampling strategies that determine how many samples
    to annotate in each stratum, given a fixed annotation budget and proxy labels for
    all samples (provided as numpy arrays). It supports two allocation strategies:

    - **Proportional allocation** (baseline): Allocates budget proportionally to stratum
      sizes, resulting in uniform sampling probabilities across the dataset.

    - **Neyman allocation** (default, optimal): Assigns more budget to strata with higher
      proxy variance, minimising the asymptotic variance of downstream estimators.
      Particularly effective when proxy variance varies substantially across strata.

    Both allocators use largest-remainder rounding (Hamilton's method) to allocate budget
    across strata. Per-stratum sample sizes are capped at stratum size, so total allocated
    budget Σ n_h ≤ budget (may be less if strata are small). On top of that, ``sample``
    only accepts allocations with ``2 <= n_h <= N_h - 2`` for every stratum (``N_h`` being
    the stratum size) and raises otherwise. The sampler is typically used upstream of
    statistical estimators to plan annotation effort.

    References
    ----------
    Fogliato, Riccardo, Pratik Patil, Mathew Monfort, and Pietro Perona. "A framework
    for efficient model evaluation through stratification, sampling, and estimation."
    In European Conference on Computer Vision, pp. 140-158. Cham: Springer Nature
    Switzerland, 2024.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.samplers import StratifiedSampler
    >>> y_proxy = np.array([0.8, 0.9, 0.85, 0.88, 2.4 , 2.5 , 2.45, 2.48])
    >>> groups = np.array(["A", "A", "A", "A", "B", "B", "B", "B"], dtype=object)
    >>> sampler = StratifiedSampler()
    >>> xi = sampler.sample(y_proxy, groups, budget=4, random_seed=1)
    >>> xi
    array([0, 1, 1, 0, 1, 0, 1, 0])
    """

    def _validate(self, y_proxy: NDArray, groups: NDArray, budget: int, strategy: str) -> None:
        """Check the inputs of ``sample`` before any allocation is computed.

        Validates, in order: the strategy name, the budget (integer, strictly
        positive, bounded by ``len(y_proxy)``), the full proxy array, and then
        the proxy values of every stratum separately. The individual checks
        are delegated to module-level ``_validate_*`` helpers.
        """
        _validate_literal(strategy, "strategy", ["proportional", "neyman"])
        _validate_is_integer(budget, "budget")
        _validate_strictly_positive(budget, "budget")
        _validate_budget_bound(budget, len(y_proxy))
        _validate_y_proxy(y_proxy)
        for stratum_id in np.unique(groups):
            stratum_mask = groups == stratum_id
            _validate_y_proxy(y_proxy[stratum_mask], stratum_id)

    def _apply_largest_remainder_rounding(
        self,
        raw_allocation: Dict[Hashable, float],
        budget: int,
    ) -> Dict[Hashable, int]:
        """Round fractional allocations to integers with Hamilton's method.

        Each stratum first receives the floor of its raw allocation; the slots
        still missing to reach ``budget`` are then handed out one by one to
        the strata with the largest fractional remainders. Ties are broken in
        favour of the stratum that comes first in ``raw_allocation``.

        Parameters
        ----------
        raw_allocation : Dict[Hashable, float]
            Fractional allocation per stratum.
        budget : int
            Total number of slots the rounded allocation should add up to.

        Returns
        -------
        Dict[Hashable, int]
            Integer allocation per stratum, in the key order of ``raw_allocation``.
        """
        allocation = {stratum_id: int(np.floor(raw)) for stratum_id, raw in raw_allocation.items()}
        remainders = {stratum_id: raw - allocation[stratum_id] for stratum_id, raw in raw_allocation.items()}

        # Never negative for consistent inputs; the clamp keeps a negative
        # value from being interpreted as a "all but the last k" slice.
        remaining_slots = max(budget - sum(allocation.values()), 0)

        # ``sorted`` is stable, so equal remainders keep their input order.
        by_remainder = sorted(remainders, key=remainders.get, reverse=True)
        for stratum_id in by_remainder[:remaining_slots]:
            allocation[stratum_id] += 1

        return allocation

    def _cap_at_stratum_sizes(
        self,
        allocation: Dict[Hashable, int],
        stratum_sizes: Dict[Hashable, int],
    ) -> Dict[Hashable, int]:
        """Limit every stratum's allocation to the number of samples it holds."""
        return {stratum_id: min(n_h, stratum_sizes[stratum_id]) for stratum_id, n_h in allocation.items()}

    def _neyman_allocation(
        self,
        y_proxy: NDArray,
        groups: NDArray,
        budget: int,
    ) -> Dict[Hashable, int]:
        """Allocate the budget proportionally to ``N_h * std_h`` per stratum.

        ``N_h`` is the stratum size and ``std_h`` the sample standard deviation
        (``ddof=1``) of the proxy labels in that stratum. The fractional shares
        are rounded with the largest-remainder method and capped at ``N_h``.

        Raises
        ------
        ValueError
            If the total weight is not strictly positive, i.e. every stratum
            has zero proxy variance, or some stratum yields a NaN standard
            deviation (for instance a single-sample stratum).
        """
        weights = {}
        stratum_sizes = {}
        for stratum_id in np.unique(groups):
            stratum_mask = groups == stratum_id
            stratum_size = stratum_mask.sum()
            stratum_std = np.std(y_proxy[stratum_mask], ddof=1)
            weights[stratum_id] = stratum_size * stratum_std
            stratum_sizes[stratum_id] = stratum_size

        total_weight = sum(weights.values())
        # ``not (x > 0)`` also catches NaN. Without this guard the division
        # below yields NaN shares and the rounding step fails with an opaque
        # "cannot convert float NaN to integer" error.
        if not total_weight > 0:
            raise ValueError(
                f"Neyman allocation is undefined: the total weight sum(N_h * std_h) is {total_weight}. "
                "Every stratum needs at least two samples and at least one stratum needs a non-zero "
                "proxy variance. Consider using proportional sampling."
            )

        raw_allocation = {stratum_id: budget * weight / total_weight for stratum_id, weight in weights.items()}
        allocation = self._apply_largest_remainder_rounding(raw_allocation, budget)
        return self._cap_at_stratum_sizes(allocation, stratum_sizes)

    def _proportional_allocation(
        self,
        groups: NDArray,
        budget: int,
    ) -> Dict[Hashable, int]:
        """Allocate the budget proportionally to the stratum sizes.

        The fractional shares ``budget * N_h / N`` are rounded with the
        largest-remainder method and capped at the stratum size ``N_h``.
        """
        total_size = len(groups)

        raw_allocation = {}
        stratum_sizes = {}
        for stratum_id in np.unique(groups):
            stratum_size = (groups == stratum_id).sum()
            raw_allocation[stratum_id] = budget * stratum_size / total_size
            stratum_sizes[stratum_id] = stratum_size

        allocation = self._apply_largest_remainder_rounding(raw_allocation, budget)
        return self._cap_at_stratum_sizes(allocation, stratum_sizes)

    def sample(
        self,
        y_proxy: NDArray,
        groups: NDArray,
        budget: int,
        strategy: Literal["proportional", "neyman"] = "neyman",
        random_seed: Optional[int] = None,
    ) -> NDArray:
        """Allocate annotation budget across strata and perform stratified sampling.

        Computes allocated annotation counts ``n_h`` for each stratum ``h`` using the
        specified allocation strategy and selects exactly ``n_h`` samples from each stratum
        without replacement. Neyman allocation (default) assigns more budget to strata with higher
        proxy variance, minimising asymptotic variance of downstream estimators. Proportional
        allocation allocates budget proportionally to stratum sizes and serves as a baseline.

        Every stratum must end up with at least two selected samples and at least two
        unselected samples; allocations that violate this are rejected.

        Parameters
        ----------
        y_proxy : NDArray
            Proxy labels for all samples, shape ``(n_samples,)``. Must be 1-dimensional.
        groups : NDArray
            Stratum identifiers for all samples, shape ``(n_samples,)``.
            Must be 1-dimensional with same length as y_proxy.
        budget : int
            Target annotation budget. Must be a strictly positive integer that does
            not exceed ``len(y_proxy)``.
        strategy : str, optional
            Allocation strategy: "neyman" (default) or "proportional".
            "neyman": assigns more budget to higher-variance strata.
            "proportional": allocates proportionally to stratum sizes.
        random_seed : int or None, optional
            Random seed for reproducible sampling. Defaults to None (non-deterministic).

        Returns
        -------
        NDArray
            Integer selection indicators of shape ``(n_samples,)``: 1 if the sample was
            selected for annotation, 0 otherwise.

        Raises
        ------
        ValueError
            - If ``strategy`` is not a recognized allocation strategy.
            - If ``budget`` is not a strictly positive integer.
            - If ``budget`` exceeds the total number of samples in the input.
            - If any stratum is allocated fewer than two annotation slots
              (budget too low or stratum too small).
            - If any stratum is allocated more than its size minus two
              (over-allocation).
            - If Neyman allocation is requested but the total weight
              ``sum(N_h * std_h)`` is not strictly positive.
        """
        self._validate(y_proxy, groups, budget, strategy)

        if strategy == "proportional":
            allocation = self._proportional_allocation(groups, budget)
        else:
            allocation = self._neyman_allocation(y_proxy, groups, budget)

        # Positions of every stratum, computed once and reused by both the
        # feasibility checks and the actual draw (same order as np.unique).
        stratum_indices = {stratum_id: np.flatnonzero(groups == stratum_id) for stratum_id in np.unique(groups)}

        for stratum_id, n_h in allocation.items():
            if n_h < 2:
                raise ValueError(
                    f"Stratum '{stratum_id}' has fewer than two allocations. All strata must receive at least "
                    "two annotation slots. Consider increasing the budget or using bigger strata."
                )

            # At least two samples of each stratum must stay unselected.
            if n_h > len(stratum_indices[stratum_id]) - 2:
                raise ValueError(
                    f"Stratum '{stratum_id}' has been over-allocated. Consider using proportional sampling."
                )

        rng = np.random.default_rng(random_seed)

        xi = np.zeros(len(y_proxy), dtype=int)
        for stratum_id, indices in stratum_indices.items():
            selected_samples = rng.choice(indices, size=allocation[stratum_id], replace=False)
            xi[selected_samples] = 1

        return xi

sample

sample(
    y_proxy,
    groups,
    budget,
    strategy="neyman",
    random_seed=None,
)

Allocate annotation budget across strata and perform stratified sampling.

Computes allocated annotation counts n_h for each stratum h using the specified allocation strategy and selects exactly n_h samples from each stratum without replacement. Neyman allocation (default) assigns more budget to strata with higher proxy variance, minimising asymptotic variance of downstream estimators. Proportional allocation allocates budget proportionally to stratum sizes and serves as a baseline.

Parameters:

Name Type Description Default
y_proxy NDArray

Proxy labels for all samples, shape (n_samples,). Must be 1-dimensional.

required
groups NDArray

Stratum identifiers for all samples, shape (n_samples,). Must be 1-dimensional with same length as y_proxy.

required
budget int

Target annotation budget. Must be a strictly positive integer that does not exceed the number of samples.

required
strategy str

Allocation strategy: "neyman" (default) or "proportional". "neyman": assigns more budget to higher-variance strata. "proportional": allocates proportionally to stratum sizes.

'neyman'
random_seed int or None

Random seed for reproducible sampling. Defaults to None (non-deterministic).

None

Returns:

Type Description
NDArray

Selection indicators of shape (n_samples,): 1 if the sample was selected for annotation, 0 otherwise.

Raises:

Type Description
ValueError
  • If strategy is not a recognized allocation strategy.
  • If budget is not a strictly positive integer.
  • If budget exceeds the total number of samples in the input.
  • If any stratum is allocated fewer than two annotation slots (budget too low or stratum too small).
  • If any stratum is allocated more than its size minus two (over-allocation).
Source code in glide/samplers/stratified.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def sample(
    self,
    y_proxy: NDArray,
    groups: NDArray,
    budget: int,
    strategy: Literal["proportional", "neyman"] = "neyman",
    random_seed: Optional[int] = None,
) -> NDArray:
    """Split the annotation budget between strata, then sample inside each one.

    An allocation mapping (stratum identifier -> count ``n_h``) is obtained from
    ``self._proportional_allocation`` when ``strategy == "proportional"`` and from
    ``self._neyman_allocation`` otherwise. Every stratum's count is checked, and
    then exactly ``n_h`` positions are drawn without replacement from each stratum.
    Neyman allocation (the default) favours strata with higher proxy variance;
    proportional allocation follows stratum sizes and serves as a baseline.

    Parameters
    ----------
    y_proxy : NDArray
        Proxy labels for all samples, shape ``(n_samples,)``. Must be 1-dimensional.
    groups : NDArray
        Stratum identifier of every sample, shape ``(n_samples,)``. Must be
        1-dimensional and as long as ``y_proxy``.
    budget : int
        Target annotation budget. Must be positive.
    strategy : str, optional
        ``"neyman"`` (default) or ``"proportional"``.
    random_seed : int or None, optional
        Seed handed to ``numpy.random.default_rng``. ``None`` (the default)
        gives a non-deterministic draw.

    Returns
    -------
    NDArray
        Integer array of shape ``(n_samples,)``: 1 where the sample was selected
        for annotation, 0 elsewhere.

    Raises
    ------
    ValueError
        - If a stratum is allocated fewer than two annotations.
        - If a stratum is allocated more than its size minus two.
        - For invalid inputs (unrecognized ``strategy``, ``budget`` that is not a
          strictly positive integer or that exceeds the number of samples); these
          checks are delegated to ``self._validate``.
    """
    self._validate(y_proxy, groups, budget, strategy)

    allocation = (
        self._proportional_allocation(groups, budget)
        if strategy == "proportional"
        else self._neyman_allocation(y_proxy, groups, budget)
    )

    # Vet the whole allocation up front so that nothing is drawn when any
    # stratum has an unusable count.
    for stratum_id, n_h in allocation.items():
        if n_h < 2:
            too_few_message = (
                f"Stratum '{stratum_id}' has fewer than two allocations. "
                "All strata must receive at least two annotation slots. "
                "Consider increasing the budget or using bigger strata."
            )
            raise ValueError(too_few_message)

        stratum_size = (groups == stratum_id).sum()
        largest_allowed = stratum_size - 2
        if n_h > largest_allowed:
            raise ValueError(f"Stratum '{stratum_id}' has been over-allocated. Consider using proportional sampling.")

    rng = np.random.default_rng(random_seed)
    xi = np.zeros(len(y_proxy), dtype=int)

    # Strata are visited in the sorted order returned by np.unique, which fixes
    # the order in which the generator is consumed.
    for stratum_id in np.unique(groups):
        member_positions = np.flatnonzero(groups == stratum_id)
        chosen_positions = rng.choice(member_positions, size=allocation[stratum_id], replace=False)
        xi[chosen_positions] = 1

    return xi

glide.samplers.cost_optimal_random.CostOptimalRandomSampler

Sampler implementing cost-optimal random annotation.

Implements the optimal random sampling strategy for two-rater annotation, where one rater is expensive (ground truth) and one is cheap (proxy). Determines the optimal probability of requesting the expensive rater based on relative costs and annotation quality differences.

References

Angelopoulos, Anastasios N., Jacob Eisenstein, Jonathan Berant, Alekh Agarwal, and Adam Fisch. "Cost-optimal active ai model evaluation." arXiv preprint arXiv:2506.07949 (2025).

Examples:

>>> import numpy as np
>>> from glide.samplers import CostOptimalRandomSampler
>>> y_true = np.array([1.0, 2.0])
>>> y_proxy = np.array([1.1, 1.9])
>>> sampler = CostOptimalRandomSampler()
>>> sampler = sampler.fit(y_true, y_proxy)
>>> pi, xi = sampler.sample(
...     n_samples=2,
...     y_true_cost=10.0,
...     y_proxy_cost=1.0,
...     budget=15,
...     random_seed=42
... )
>>> pi
array([0.0451754, 0.0451754])
>>> xi
array([0., 0.])
Source code in glide/samplers/cost_optimal_random.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
class CostOptimalRandomSampler:
    """Random sampler whose single annotation probability is cost-optimal.

    Targets the two-rater setting: an expensive rater that provides ground
    truth and a cheap rater that provides proxy labels. After calibration on a
    burn-in set, one probability of querying the expensive rater is derived
    from the relative costs and the proxy quality, and shared by all samples.

    References
    ----------
    Angelopoulos, Anastasios N., Jacob Eisenstein, Jonathan Berant, Alekh
    Agarwal, and Adam Fisch. "Cost-optimal active ai model evaluation." arXiv
    preprint arXiv:2506.07949 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.samplers import CostOptimalRandomSampler
    >>> y_true = np.array([1.0, 2.0])
    >>> y_proxy = np.array([1.1, 1.9])
    >>> sampler = CostOptimalRandomSampler()
    >>> sampler = sampler.fit(y_true, y_proxy)
    >>> pi, xi = sampler.sample(
    ...     n_samples=2,
    ...     y_true_cost=10.0,
    ...     y_proxy_cost=1.0,
    ...     budget=15,
    ...     random_seed=42
    ... )
    >>> pi
    array([0.0451754, 0.0451754])
    >>> xi
    array([0., 0.])
    """

    def fit(self, y_true: NDArray, y_proxy: NDArray) -> "CostOptimalRandomSampler":
        """Learn the burn-in statistics that drive the annotation probability.

        Two quantities are stored on the instance: the sample variance of the
        ground truth labels (``ddof=1``) and the mean squared error of the
        proxy labels against them. ``sample()`` later combines both with the
        rater costs to obtain the probability of requesting ground truth.

        Parameters
        ----------
        y_true : NDArray
            Ground truth labels of the fully-labeled burn-in set, shape
            ``(n_samples,)``. Must not contain NaN values.
        y_proxy : NDArray
            Proxy labels of the same samples, shape ``(n_samples,)``. Must not
            contain NaN values.

        Returns
        -------
        CostOptimalRandomSampler
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            - If either array contains NaN, is empty, or the lengths differ.
            - If the variance of ``y_true`` is zero (all labels are identical).
            - If ``y_proxy`` equals ``y_true`` everywhere (zero mean squared
              error), since the annotation probability would then be zero and
              sampling would be impossible.
        """
        _validate_y_true_burn_in(y_true)
        _validate_equal_lengths(y_true, y_proxy, names=["y_true", "y_proxy"])
        _validate_has_no_nan(y_proxy, "y_proxy")
        _validate_has_no_nan(y_true, "y_true")

        residuals = y_true - y_proxy
        # A largest absolute residual of zero means every residual is zero.
        if np.max(np.abs(residuals)) == 0:
            raise ValueError("'y_proxy' predicts 'y_true' perfectly (zero MSE). Annotation probability would be zero")

        # Both statistics are computed before either attribute is assigned.
        self._y_true_variance, self._mean_squared_error = (
            np.var(y_true, ddof=1),
            np.mean(residuals**2),
        )
        return self

    def _compute_optimal_probability(self, y_true_cost: float, y_proxy_cost: float) -> float:
        """Return the probability of querying the expensive rater.

        The result is ``1.0`` once the fitted mean squared error reaches
        ``variance * y_true_cost / (y_true_cost + y_proxy_cost)``; below that
        level it is ``sqrt((y_proxy_cost / y_true_cost) * mse / (variance - mse))``.
        """
        variance = self._y_true_variance
        mse = self._mean_squared_error

        saturation_level = variance * y_true_cost / (y_true_cost + y_proxy_cost)
        if mse >= saturation_level:
            return 1.0

        # Below the saturation level the quantity under the root is less than one.
        squared_probability = (y_proxy_cost / y_true_cost) * mse / (variance - mse)
        return float(np.sqrt(squared_probability))

    def sample(
        self,
        n_samples: int,
        y_true_cost: float,
        y_proxy_cost: float,
        budget: float,
        random_seed: Optional[Union[int, SeedSequence]] = None,
    ) -> Tuple[NDArray, NDArray]:
        """Draw annotation indicators with the cost-optimal probability, within budget.

        The probability of querying the expensive rater (ground truth) follows
        from the relative costs and the proxy quality measured by ``fit()``.

        Samples are randomly permuted before the draw and the inverse
        permutation is applied to the output, so both returned arrays follow
        the original input order. A post-draw cutoff then enforces the budget
        strictly: for samples beyond the cutoff, the ``pi`` entry is set to
        ``0.0`` and the ``xi`` entry to ``NaN``.

        Both arrays are meant for IPW-based downstream estimators: ``pi``
        carries the per-sample probability of querying the expensive rater and
        ``xi`` the annotation indicators, with NaN marking samples dropped by
        the budget cutoff.

        Parameters
        ----------
        n_samples : int
            Number of candidate samples. Must be a strictly positive integer.
        y_true_cost : float
            Per-sample cost of the expensive rater (H). Must be strictly positive.
        y_proxy_cost : float
            Per-sample cost of the cheap rater (G). Must be strictly positive.
        budget : float
            Total annotation budget in cost units. Must be at least
            ``y_true_cost + y_proxy_cost``.
        random_seed : int or SeedSequence or None, optional
            Seed handed to ``numpy.random.default_rng`` for reproducibility.
            ``None`` (the default) gives a non-deterministic seed.

        Returns
        -------
        Tuple[NDArray, NDArray]
            [0]: ``pi``, array of shape ``(n_samples,)`` with per-sample annotation
            probabilities for selected samples and ``0.0`` for unselected samples.
            [1]: ``xi``, array of shape ``(n_samples,)`` with Bernoulli indicators:
            ``1.0`` if selected for annotation, ``0.0`` if not selected,
            ``NaN`` if excluded by the budget cutoff.

        Raises
        ------
        RuntimeError
            If ``fit()`` has not been called before ``sample()``.
        ValueError
            - If ``n_samples`` is not a strictly positive integer.
            - If ``y_true_cost`` or ``y_proxy_cost`` is not strictly positive.
            - If ``budget < y_true_cost + y_proxy_cost``.
        """
        is_fitted = hasattr(self, "_y_true_variance") and hasattr(self, "_mean_squared_error")
        if not is_fitted:
            raise RuntimeError("Call fit() before sample().")

        _validate_is_integer(n_samples, "n_samples")
        _validate_strictly_positive(n_samples, "n_samples")
        _validate_strictly_positive(y_true_cost, "y_true_cost")
        _validate_strictly_positive(y_proxy_cost, "y_proxy_cost")

        # The budget must cover at least one sample labelled by both raters.
        minimum_budget = y_true_cost + y_proxy_cost
        _validate_bounds(
            budget,
            "budget",
            lower=minimum_budget,
            error_message=f"'budget' should be at least {minimum_budget}; got {budget}.",
        )

        probability = self._compute_optimal_probability(y_true_cost, y_proxy_cost)
        uniform_pi = np.full(n_samples, probability)

        rng = np.random.default_rng(random_seed)
        # The generator is consumed by the shuffle first, then by the Bernoulli draw.
        shuffled_pi, order = _shuffle(uniform_pi, rng)
        shuffled_xi = rng.binomial(n=1, p=shuffled_pi).astype(float)

        # Every visited sample pays the proxy cost; annotated ones also pay the true cost.
        running_cost = np.cumsum(shuffled_xi * y_true_cost + y_proxy_cost)
        kept_indices = _compute_cutoff_indices(running_cost, order, budget)
        pi_out, xi_out = _build_output(kept_indices, shuffled_pi, shuffled_xi)
        return pi_out, xi_out

fit

fit(y_true, y_proxy)

Calibrate the sampler by estimating proxy quality and label variance.

Fits the sampler to a fully-labeled burn-in dataset by computing the mean squared error between proxy labels and ground truth labels, as well as the variance of ground truth labels. These statistics are used to determine the optimal probability of requesting expensive ground truth annotations during the sampling phase.

Parameters:

Name Type Description Default
y_true NDArray

Ground truth labels, shape (n_samples,). Must not contain NaN values.

required
y_proxy NDArray

Proxy labels, shape (n_samples,). Must not contain NaN values.

required

Returns:

Type Description
CostOptimalRandomSampler

Self, to allow method chaining.

Raises:

Type Description
ValueError
  • If either array contains NaN, is empty, or arrays have different lengths.
  • If the variance of y_true is zero (all labels are identical).
  • If the mean squared error between y_true and y_proxy is zero (proxy labels match ground truth perfectly). This would lead to zero annotation probability making sampling impossible.
Source code in glide/samplers/cost_optimal_random.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def fit(self, y_true: NDArray, y_proxy: NDArray) -> "CostOptimalRandomSampler":
    """Learn the burn-in statistics that drive the annotation probability.

    Two quantities are stored on the instance: the sample variance of the
    ground truth labels (``ddof=1``) and the mean squared error of the
    proxy labels against them. They are later used to determine the
    probability of requesting expensive ground truth annotations.

    Parameters
    ----------
    y_true : NDArray
        Ground truth labels of the fully-labeled burn-in set, shape
        ``(n_samples,)``. Must not contain NaN values.
    y_proxy : NDArray
        Proxy labels of the same samples, shape ``(n_samples,)``. Must not
        contain NaN values.

    Returns
    -------
    CostOptimalRandomSampler
        ``self``, so calls can be chained.

    Raises
    ------
    ValueError
        - If either array contains NaN, is empty, or the lengths differ.
        - If the variance of ``y_true`` is zero (all labels are identical).
        - If ``y_proxy`` equals ``y_true`` everywhere (zero mean squared
          error), since the annotation probability would then be zero and
          sampling would be impossible.
    """
    _validate_y_true_burn_in(y_true)
    _validate_equal_lengths(y_true, y_proxy, names=["y_true", "y_proxy"])
    _validate_has_no_nan(y_proxy, "y_proxy")
    _validate_has_no_nan(y_true, "y_true")

    residuals = y_true - y_proxy
    # A largest absolute residual of zero means every residual is zero.
    if np.max(np.abs(residuals)) == 0:
        raise ValueError("'y_proxy' predicts 'y_true' perfectly (zero MSE). Annotation probability would be zero")

    # Both statistics are computed before either attribute is assigned.
    self._y_true_variance, self._mean_squared_error = (
        np.var(y_true, ddof=1),
        np.mean(residuals**2),
    )
    return self

sample

sample(
    n_samples,
    y_true_cost,
    y_proxy_cost,
    budget,
    random_seed=None,
)

Sample observations with cost-optimal allocation between raters.

Derives the optimal probability of querying the expensive rater (ground truth) based on relative costs and proxy quality.

Samples are randomly permuted before drawing and the inverse permutation is applied to the output, so the returned arrays are always in the original input order. A post-draw cutoff is then applied to strictly respect the budget: samples beyond the cutoff are discarded by setting their entries in pi and xi to 0.0 and NaN respectively.

The two returned arrays are intended for use with IPW-based downstream estimators. pi holds the per-sample probability of querying the expensive rater. xi holds the annotation indicators for selected samples, with NaN marking samples excluded by the budget cutoff.

Parameters:

Name Type Description Default
n_samples int

Total number of candidate samples to draw from. Must be a strictly positive integer.

required
y_true_cost float

Per-sample cost of the expensive rater (H). Must be strictly positive.

required
y_proxy_cost float

Per-sample cost of the cheap rater (G). Must be strictly positive.

required
budget float

Total annotation budget in cost units. Must be at least y_true_cost + y_proxy_cost.

required
random_seed int or SeedSequence or None

Random seed passed to numpy.random.default_rng for reproducibility. Pass None (the default) to use a non-deterministic seed.

None

Returns:

Type Description
Tuple[NDArray, NDArray]

[0]: array of shape (n_samples,), pi with per-sample annotation probabilities for selected samples and 0.0 for unselected samples. [1]: array of shape (n_samples,), xi with Bernoulli indicators: 1.0 if selected for annotation, 0.0 if not selected, NaN if excluded by the budget cutoff.

Raises:

Type Description
RuntimeError

If fit() has not been called before sample().

ValueError
  • If n_samples is not a strictly positive integer.
  • If y_true_cost or y_proxy_cost is not strictly positive.
  • If budget < y_true_cost + y_proxy_cost.
Source code in glide/samplers/cost_optimal_random.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def sample(
    self,
    n_samples: int,
    y_true_cost: float,
    y_proxy_cost: float,
    budget: float,
    random_seed: Optional[Union[int, SeedSequence]] = None,
) -> Tuple[NDArray, NDArray]:
    """Draw annotation indicators with the cost-optimal probability, within budget.

    The probability of querying the expensive rater (ground truth) follows
    from the relative costs and the proxy quality measured by ``fit()``.

    Samples are randomly permuted before the draw and the inverse
    permutation is applied to the output, so both returned arrays follow
    the original input order. A post-draw cutoff then enforces the budget
    strictly: for samples beyond the cutoff, the ``pi`` entry is set to
    ``0.0`` and the ``xi`` entry to ``NaN``.

    Both arrays are meant for IPW-based downstream estimators: ``pi``
    carries the per-sample probability of querying the expensive rater and
    ``xi`` the annotation indicators, with NaN marking samples dropped by
    the budget cutoff.

    Parameters
    ----------
    n_samples : int
        Number of candidate samples. Must be a strictly positive integer.
    y_true_cost : float
        Per-sample cost of the expensive rater (H). Must be strictly positive.
    y_proxy_cost : float
        Per-sample cost of the cheap rater (G). Must be strictly positive.
    budget : float
        Total annotation budget in cost units. Must be at least
        ``y_true_cost + y_proxy_cost``.
    random_seed : int or SeedSequence or None, optional
        Seed handed to ``numpy.random.default_rng`` for reproducibility.
        ``None`` (the default) gives a non-deterministic seed.

    Returns
    -------
    Tuple[NDArray, NDArray]
        [0]: ``pi``, array of shape ``(n_samples,)`` with per-sample annotation
        probabilities for selected samples and ``0.0`` for unselected samples.
        [1]: ``xi``, array of shape ``(n_samples,)`` with Bernoulli indicators:
        ``1.0`` if selected for annotation, ``0.0`` if not selected,
        ``NaN`` if excluded by the budget cutoff.

    Raises
    ------
    RuntimeError
        If ``fit()`` has not been called before ``sample()``.
    ValueError
        - If ``n_samples`` is not a strictly positive integer.
        - If ``y_true_cost`` or ``y_proxy_cost`` is not strictly positive.
        - If ``budget < y_true_cost + y_proxy_cost``.
    """
    is_fitted = hasattr(self, "_y_true_variance") and hasattr(self, "_mean_squared_error")
    if not is_fitted:
        raise RuntimeError("Call fit() before sample().")

    _validate_is_integer(n_samples, "n_samples")
    _validate_strictly_positive(n_samples, "n_samples")
    _validate_strictly_positive(y_true_cost, "y_true_cost")
    _validate_strictly_positive(y_proxy_cost, "y_proxy_cost")

    # The budget must cover at least one sample labelled by both raters.
    minimum_budget = y_true_cost + y_proxy_cost
    _validate_bounds(
        budget,
        "budget",
        lower=minimum_budget,
        error_message=f"'budget' should be at least {minimum_budget}; got {budget}.",
    )

    probability = self._compute_optimal_probability(y_true_cost, y_proxy_cost)
    uniform_pi = np.full(n_samples, probability)

    rng = np.random.default_rng(random_seed)
    # The generator is consumed by the shuffle first, then by the Bernoulli draw.
    shuffled_pi, order = _shuffle(uniform_pi, rng)
    shuffled_xi = rng.binomial(n=1, p=shuffled_pi).astype(float)

    # Every visited sample pays the proxy cost; annotated ones also pay the true cost.
    running_cost = np.cumsum(shuffled_xi * y_true_cost + y_proxy_cost)
    kept_indices = _compute_cutoff_indices(running_cost, order, budget)
    pi_out, xi_out = _build_output(kept_indices, shuffled_pi, shuffled_xi)
    return pi_out, xi_out

glide.samplers.cost_optimal.CostOptimalSampler

Sampler that draws elements with optimal probabilities based on uncertainty scores and annotation costs on a limited budget.

Implements a cost-optimal active annotation policy. Each sample is assigned an annotation probability proportional to how unreliable the proxy label is expected to be for that sample, as measured by the caller-supplied per-sample uncertainty scores. Samples with high expected proxy error are more likely to be annotated whereas those with low expected proxy error are less likely to be annotated. This concentrates the annotation budget where it matters most.

The caller provides per-sample uncertainty scores and passes them as a 1D array to sample(). These are treated as oracle root mean square error estimates. This class does not learn those scores internally.

References

Angelopoulos, Anastasios N., Jacob Eisenstein, Jonathan Berant, Alekh Agarwal, and Adam Fisch. "Cost-optimal active ai model evaluation." arXiv preprint arXiv:2506.07949 (2025).

Examples:

>>> import numpy as np
>>> from glide.samplers import CostOptimalSampler
>>> y_true = np.array([1.0, 2.0, 3.0, 4.0])
>>> uncertainties = np.array([0.1, 0.4, 0.1, 0.4])
>>> sampler = CostOptimalSampler().fit(y_true)
>>> pi, xi = sampler.sample(
...     uncertainties,
...     y_true_cost=10.0,
...     y_proxy_cost=1.0,
...     budget=20,
...     random_seed=0
... )
>>> pi
array([0.02514447, 0.10057789, 0.02514447, 0.10057789])
>>> xi
array([0., 0., 0., 1.])
Source code in glide/samplers/cost_optimal.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
class CostOptimalSampler:
    """Budget-limited sampler with per-sample, uncertainty-driven annotation
    probabilities.

    Implements a cost-optimal active annotation policy. The caller supplies one
    uncertainty score per sample, and the annotation probability grows with
    that score: samples whose proxy label is expected to be unreliable are
    annotated more often, samples with a trustworthy proxy less often. The
    annotation budget is thereby concentrated where it matters most.

    The uncertainty scores are passed as a 1D array to ``sample()`` and are
    treated as oracle root mean square error estimates. This class does not
    learn those scores internally.

    References
    ----------
    Angelopoulos, Anastasios N., Jacob Eisenstein, Jonathan Berant, Alekh
    Agarwal, and Adam Fisch. "Cost-optimal active ai model evaluation." arXiv
    preprint arXiv:2506.07949 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.samplers import CostOptimalSampler
    >>> y_true = np.array([1.0, 2.0, 3.0, 4.0])
    >>> uncertainties = np.array([0.1, 0.4, 0.1, 0.4])
    >>> sampler = CostOptimalSampler().fit(y_true)
    >>> pi, xi = sampler.sample(
    ...     uncertainties,
    ...     y_true_cost=10.0,
    ...     y_proxy_cost=1.0,
    ...     budget=20,
    ...     random_seed=0
    ... )
    >>> pi
    array([0.02514447, 0.10057789, 0.02514447, 0.10057789])
    >>> xi
    array([0., 0., 0., 1.])
    """

    def fit(self, y_true: NDArray) -> "CostOptimalSampler":
        """Store the sample variance of the burn-in true labels.

        The variance (``ddof=1``) is computed ahead of active sampling so that
        ``sample()`` can derive the cost-optimal annotation probabilities.

        Parameters
        ----------
        y_true : NDArray
            1D float array of true labels from the burn-in phase. Must not
            contain NaN values.

        Returns
        -------
        CostOptimalSampler
            The fitted sampler (``self``, for method chaining).

        Raises
        ------
        ValueError
            If ``y_true`` is empty, contains NaN, or all labels are identical
            (zero true label variance).
        """
        _validate_y_true_burn_in(y_true)
        label_variance = np.var(y_true, ddof=1)
        self._y_true_variance = label_variance
        return self

    def _compute_gamma(
        self,
        tau: float,
        uncertainties: NDArray,
        y_true_cost: float,
        y_proxy_cost: float,
    ) -> float:
        """Return the scaling factor applied to uncertainties at or below ``tau``.

        The uncapped value is ``sqrt((cost_ratio + P(u > tau)) / denominator)``,
        where the denominator is the fitted variance minus the mean of the
        squared uncertainties not exceeding ``tau`` (floored at zero). The
        result is capped at ``1 / tau``.
        """
        exceeds_tau = uncertainties > tau
        fraction_above = np.mean(exceeds_tau)
        # Samples above tau contribute zero to this mean but still count in its size.
        mean_square_below = np.mean(uncertainties**2 * ~exceeds_tau)
        denominator = max(self._y_true_variance - mean_square_below, 0.0)

        # A non-positive denominator leaves only the 1 / tau cap in effect.
        uncapped = (
            np.sqrt((y_proxy_cost / y_true_cost + fraction_above) / denominator)
            if denominator > 0.0
            else float("inf")
        )
        return min(uncapped, 1.0 / tau)

    def _compute_per_sample_probabilities(
        self,
        tau: float,
        gamma: float,
        uncertainties: NDArray,
    ) -> NDArray:
        """Return ``1.0`` where the uncertainty exceeds ``tau`` and ``gamma * uncertainty`` elsewhere."""
        scaled = gamma * uncertainties
        return np.where(uncertainties > tau, 1.0, scaled)

    def _compute_objective(
        self,
        tau: float,
        uncertainties: NDArray,
        y_true_cost: float,
        y_proxy_cost: float,
    ) -> float:
        """Return the cost-times-error objective of the policy induced by ``tau``."""
        gamma = self._compute_gamma(tau, uncertainties, y_true_cost, y_proxy_cost)
        pi_values = self._compute_per_sample_probabilities(tau, gamma, uncertainties)

        expected_cost = y_true_cost * np.mean(pi_values) + y_proxy_cost
        inflation = np.mean(uncertainties**2 * (1.0 / pi_values - 1.0))
        expected_error = self._y_true_variance + inflation
        return expected_cost * expected_error

    def _find_optimal_threshold(
        self,
        uncertainties: NDArray,
        y_true_cost: float,
        y_proxy_cost: float,
    ) -> float:
        """Return the distinct uncertainty value that minimises the objective."""

        def objective_at(tau):
            return self._compute_objective(tau, uncertainties, y_true_cost, y_proxy_cost)

        candidates = np.unique(uncertainties)
        if y_proxy_cost == 0:
            # With a free proxy the cost ratio in _compute_gamma is zero. Taking
            # the largest uncertainty as tau would leave no sample above it, so
            # gamma and every probability would be zero and _compute_objective
            # would divide by zero. Removing that candidate guarantees at least
            # one sample lies above any remaining tau.
            candidates = candidates[:-1]

        objectives = [objective_at(tau) for tau in candidates]
        best_position = np.argmin(objectives)
        return candidates[best_position]

    def sample(
        self,
        uncertainties: NDArray,
        y_true_cost: float,
        y_proxy_cost: float,
        budget: float,
        random_seed: Optional[Union[int, SeedSequence]] = None,
    ) -> Tuple[NDArray, NDArray]:
        """Derive cost-optimal probabilities and draw annotation indicators within budget.

        Per-sample annotation probabilities are computed from the supplied
        uncertainty scores (root mean squared errors) and the true label
        variance estimated by ``fit()``.

        Samples are randomly permuted before the draw and the inverse
        permutation is applied to the output, so both returned arrays follow
        the original input order. A post-draw cutoff then enforces the budget
        strictly: for samples beyond the cutoff, the ``pi`` entry is set to
        ``0.0`` and the ``xi`` entry to ``NaN``.

        Both arrays are meant for IPW-based downstream estimators: ``pi``
        carries the per-sample probability of querying the expensive rater and
        ``xi`` the annotation indicators, with NaN marking samples dropped by
        the budget cutoff.

        Parameters
        ----------
        uncertainties : NDArray
            1D float array of shape ``(n_samples,)`` with the pre-computed per-sample
            root mean squared error of the proxy label. All values must be strictly positive.
        y_true_cost : float
            Cost of one true label. Must be strictly positive.
        y_proxy_cost : float
            Cost of one proxy label. Must be non-negative.
        budget : float
            Total annotation budget in cost units. Must be at least
            ``y_true_cost + y_proxy_cost``.
        random_seed : int or SeedSequence or None, optional
            Seed handed to ``numpy.random.default_rng`` for reproducibility.
            ``None`` (the default) gives a non-deterministic seed.

        Returns
        -------
        Tuple[NDArray, NDArray]
            [0]: ``pi``, array of shape ``(n_samples,)`` with per-sample annotation
            probabilities for selected samples and ``0.0`` for unselected samples.
            [1]: ``xi``, array of shape ``(n_samples,)`` with Bernoulli indicators:
            ``1.0`` if selected for annotation, ``0.0`` if not selected,
            ``NaN`` if excluded by the budget cutoff.

        Raises
        ------
        RuntimeError
            If ``fit()`` has not been called before ``sample()``.
        ValueError
            - If ``y_true_cost`` is not strictly positive or ``y_proxy_cost`` is negative.
            - If any uncertainty value is NaN or non-positive.
            - If all uncertainty values are equal and ``y_proxy_cost`` is zero.
            - If ``budget < y_true_cost + y_proxy_cost``.
        """
        is_fitted = hasattr(self, "_y_true_variance")
        if not is_fitted:
            raise RuntimeError("Call fit() before sample().")

        _validate_strictly_positive(y_true_cost, "y_true_cost")
        if y_proxy_cost < 0.0:
            raise ValueError(f"'y_proxy_cost' must be non-negative; got {y_proxy_cost}.")
        if y_proxy_cost == 0.0:
            # A free proxy needs at least two distinct uncertainty values.
            constant_message = (
                "All uncertainty values are equal and 'y_proxy_cost' is zero."
                " Provide non-constant uncertainties or set 'y_proxy_cost' to a positive value."
            )
            _validate_non_constant(uncertainties, constant_message)
        _validate_uncertainties(uncertainties)

        minimum_budget = y_true_cost + y_proxy_cost
        _validate_bounds(
            budget,
            "budget",
            lower=minimum_budget,
            error_message=f"'budget' should be at least {minimum_budget}; got {budget}.",
        )

        best_tau = self._find_optimal_threshold(uncertainties, y_true_cost, y_proxy_cost)
        best_gamma = self._compute_gamma(best_tau, uncertainties, y_true_cost, y_proxy_cost)
        probabilities = self._compute_per_sample_probabilities(best_tau, best_gamma, uncertainties)

        rng = np.random.default_rng(random_seed)
        # The generator is consumed by the shuffle first, then by the Bernoulli draw.
        shuffled_pi, order = _shuffle(probabilities, rng)
        shuffled_xi = rng.binomial(n=1, p=shuffled_pi).astype(float)

        # Every visited sample pays the proxy cost; annotated ones also pay the true cost.
        running_cost = np.cumsum(shuffled_xi * y_true_cost + y_proxy_cost)
        kept_indices = _compute_cutoff_indices(running_cost, order, budget)
        pi_out, xi_out = _build_output(kept_indices, shuffled_pi, shuffled_xi)
        return pi_out, xi_out

fit

fit(y_true)

Estimate the true label variance from a burn-in dataset.

The true label variance is computed ahead of active sampling so that sample() can derive the cost-optimal annotation probabilities.

Parameters:

Name Type Description Default
y_true NDArray

1D float array of true labels from the burn-in phase. Must not contain NaN values.

required

Returns:

Type Description
CostOptimalSampler

The fitted sampler (returns self for method chaining).

Raises:

Type Description
ValueError

If y_true is empty, contains NaN, or all labels are identical (zero true label variance).

Source code in glide/samplers/cost_optimal.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def fit(self, y_true: NDArray) -> "CostOptimalSampler":
    """Fit the sampler by estimating the variance of the true labels.

    The estimate is taken from burn-in data and stored on the instance, so that
    a later call to ``sample()`` can derive the cost-optimal annotation
    probabilities.

    Parameters
    ----------
    y_true : NDArray
        1D float array of true labels collected during the burn-in phase.
        NaN values are not allowed.

    Returns
    -------
    CostOptimalSampler
        ``self``, so that calls can be chained.

    Raises
    ------
    ValueError
        If ``y_true`` is empty, contains NaN, or has zero variance because all
        labels are identical.

    """
    _validate_y_true_burn_in(y_true)
    # ddof=1 applies Bessel's correction (the divisor is N - 1 instead of N).
    burn_in_variance = np.var(y_true, ddof=1)
    self._y_true_variance = burn_in_variance
    return self

sample

sample(
    uncertainties,
    y_true_cost,
    y_proxy_cost,
    budget,
    random_seed=None,
)

Compute sampling probabilities and draw annotation indicators under the cost optimal policy.

Per-sample annotation probabilities are derived from the supplied uncertainty scores (root mean squared errors) and the true label variance estimated by fit().

Samples are randomly permuted before drawing and the inverse permutation is applied to the output, so the returned arrays are always in the original input order. A post-draw cutoff is then applied to strictly respect the budget: samples beyond the cutoff are discarded by setting their entries in pi and xi to 0.0 and NaN respectively.

The two returned arrays are intended for use with IPW-based downstream estimators. pi holds the per-sample probability of querying the expensive rater. xi holds the annotation indicators for selected samples, with NaN marking samples excluded by the budget cutoff.

Parameters:

Name Type Description Default
uncertainties NDArray

1D float array of shape (n_samples,) containing the pre-computed per-sample root mean squared error of the proxy label. All values must be strictly positive.

required
y_true_cost float

Cost of one true label. Must be strictly positive.

required
y_proxy_cost float

Cost of one proxy label. Must be non-negative.

required
budget float

Total annotation budget in cost units. Must be at least y_true_cost + y_proxy_cost.

required
random_seed int or SeedSequence or None

Random seed passed to numpy.random.default_rng for reproducibility. Pass None (the default) to use a non-deterministic seed.

None

Returns:

Type Description
Tuple[NDArray, NDArray]

[0]: array of shape (n_samples,), pi with per-sample annotation probabilities for samples kept within the budget and 0.0 for samples excluded by the budget cutoff. [1]: array of shape (n_samples,), xi with Bernoulli indicators: 1.0 if drawn for annotation, 0.0 if not drawn, NaN if excluded by the budget cutoff.

Raises:

Type Description
RuntimeError

If fit() has not been called before sample().

ValueError
  • If y_true_cost is not strictly positive or y_proxy_cost is negative.
  • If any uncertainty value is NaN or non-positive.
  • If all uncertainty values are equal and y_proxy_cost is zero.
  • If budget < y_true_cost + y_proxy_cost.
Source code in glide/samplers/cost_optimal.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def sample(
    self,
    uncertainties: NDArray,
    y_true_cost: float,
    y_proxy_cost: float,
    budget: float,
    random_seed: Optional[Union[int, SeedSequence]] = None,
) -> Tuple[NDArray, NDArray]:
    """Draw annotation indicators and their probabilities under the cost-optimal policy.

    The per-sample annotation probabilities are computed from the supplied
    uncertainty scores (root mean squared errors of the proxy label) together
    with the true label variance stored by ``fit()``.

    Before drawing, the samples are randomly permuted; the inverse permutation is
    applied to the output, so both returned arrays follow the original input
    order. After drawing, a cutoff enforces the budget strictly: every sample
    past the cutoff is discarded, which sets its ``pi`` entry to ``0.0`` and its
    ``xi`` entry to ``NaN``.

    Both arrays are meant to feed IPW-based downstream estimators: ``pi`` is the
    probability of querying the expensive rater and ``xi`` is the realised
    annotation indicator (``NaN`` for samples removed by the budget cutoff).

    Parameters
    ----------
    uncertainties : NDArray
        1D float array of shape ``(n_samples,)`` holding the pre-computed
        per-sample root mean squared error of the proxy label. Every value must
        be strictly positive.
    y_true_cost : float
        Cost of a single true label. Must be strictly positive.
    y_proxy_cost : float
        Cost of a single proxy label. Must be non-negative.
    budget : float
        Total annotation budget, in cost units. Must be at least
        ``y_true_cost + y_proxy_cost``.
    random_seed : int or SeedSequence or None, optional
        Seed forwarded to ``numpy.random.default_rng`` for reproducibility.
        ``None`` (the default) yields a non-deterministic seed.

    Returns
    -------
    Tuple[NDArray, NDArray]
        [0]: ``pi``, array of shape ``(n_samples,)`` with the per-sample
        annotation probabilities for samples kept within the budget and ``0.0``
        for samples excluded by the budget cutoff.
        [1]: ``xi``, array of shape ``(n_samples,)`` with Bernoulli indicators:
        ``1.0`` if drawn for annotation, ``0.0`` if not drawn, ``NaN`` if
        excluded by the budget cutoff.

    Raises
    ------
    RuntimeError
        If ``sample()`` is called before ``fit()``.
    ValueError
        - If ``y_true_cost`` is not strictly positive or ``y_proxy_cost`` is negative.
        - If any uncertainty value is NaN or non-positive.
        - If all uncertainty values are equal and ``y_proxy_cost`` is zero.
        - If ``budget < y_true_cost + y_proxy_cost``.

    """
    # The variance attribute only exists once fit() has run.
    if not hasattr(self, "_y_true_variance"):
        raise RuntimeError("Call fit() before sample().")

    # --- Input validation -------------------------------------------------
    _validate_strictly_positive(y_true_cost, "y_true_cost")
    if y_proxy_cost < 0.0:
        raise ValueError(f"'y_proxy_cost' must be non-negative; got {y_proxy_cost}.")
    if y_proxy_cost == 0.0:
        # Constant uncertainties are only rejected when proxy labels are free.
        constant_message = (
            "All uncertainty values are equal and 'y_proxy_cost' is zero."
            " Provide non-constant uncertainties or set 'y_proxy_cost' to a positive value."
        )
        _validate_non_constant(uncertainties, constant_message)
    _validate_uncertainties(uncertainties)

    # The budget has to cover at least one true label plus one proxy label.
    minimum_budget = y_true_cost + y_proxy_cost
    _validate_bounds(
        budget,
        "budget",
        lower=minimum_budget,
        error_message=f"'budget' should be at least {minimum_budget}; got {budget}.",
    )

    # --- Per-sample annotation probabilities ------------------------------
    threshold = self._find_optimal_threshold(uncertainties, y_true_cost, y_proxy_cost)
    gamma = self._compute_gamma(threshold, uncertainties, y_true_cost, y_proxy_cost)
    probabilities = self._compute_per_sample_probabilities(threshold, gamma, uncertainties)

    # --- Shuffled Bernoulli draw followed by the budget cutoff -------------
    generator = np.random.default_rng(random_seed)
    shuffled_probabilities, permutation = _shuffle(probabilities, generator)
    draws = generator.binomial(n=1, p=shuffled_probabilities)
    shuffled_indicators = draws.astype(float)
    # Each sample costs one proxy label, plus one true label when it was drawn.
    per_sample_costs = shuffled_indicators * y_true_cost + y_proxy_cost
    running_costs = np.cumsum(per_sample_costs)
    kept = _compute_cutoff_indices(running_costs, permutation, budget)
    pi, xi = _build_output(kept, shuffled_probabilities, shuffled_indicators)
    return pi, xi

glide.samplers.cluster.UniformClusterSampler

Sampler that selects entire clusters without replacement using uniform sampling.

Each call to sample draws a fixed number of clusters from the pool of unique cluster labels in clusters, then marks every observation in a selected cluster for annotation. Every cluster has equal probability of being selected, so every individual observation has the same marginal probability of being annotated.

Examples:

>>> import numpy as np
>>> from glide.samplers import UniformClusterSampler
>>> clusters = np.array(["A", "A", "B", "B"], dtype=object)
>>> sampler = UniformClusterSampler()
>>> xi = sampler.sample(clusters, n_clusters=1, random_seed=0)
>>> xi
array([0, 0, 1, 1])
Source code in glide/samplers/cluster.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
class UniformClusterSampler:
    """Uniform sampler that picks whole clusters without replacement.

    A call to ``sample`` draws a fixed number of clusters among the unique labels
    found in ``clusters`` and flags every observation of a drawn cluster for
    annotation. All clusters are equally likely to be drawn, hence all
    observations share the same marginal probability of being annotated.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.samplers import UniformClusterSampler
    >>> clusters = np.array(["A", "A", "B", "B"], dtype=object)
    >>> sampler = UniformClusterSampler()
    >>> xi = sampler.sample(clusters, n_clusters=1, random_seed=0)
    >>> xi
    array([0, 0, 1, 1])
    """

    def sample(
        self,
        clusters: NDArray,
        n_clusters: int,
        random_seed: Optional[Union[int, SeedSequence]] = None,
    ) -> NDArray:
        """Draw whole clusters uniformly, without replacement.

        ``n_clusters`` distinct labels are picked with equal probability among the
        unique values of ``clusters``. The result flags each observation with 1
        when its cluster was picked and with 0 otherwise.

        Parameters
        ----------
        clusters : NDArray
            Array of shape ``(n_samples,)`` giving the cluster identifier of each
            observation.
        n_clusters : int
            How many clusters to draw. Must be a strictly positive integer no
            larger than the number of unique clusters in ``clusters``.
        random_seed : int or SeedSequence or None, optional
            Seed forwarded to ``numpy.random.default_rng`` for reproducibility.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        NDArray
            Array of shape ``(n_samples,)`` of selection indicators: 1 for
            observations in a drawn cluster, 0 for all others.

        Raises
        ------
        ValueError
            - If ``n_clusters`` is not a strictly positive integer.
            - If ``n_clusters`` exceeds the number of unique clusters in ``clusters``.
        """
        _validate_is_integer(n_clusters, "n_clusters")
        _validate_strictly_positive(n_clusters, "n_clusters")

        # The pool to draw from is the set of distinct cluster labels.
        cluster_pool = np.unique(clusters)
        pool_size = len(cluster_pool)
        too_many_message = (
            f"'n_clusters' must not exceed the number of unique clusters; "
            f"got n_clusters={n_clusters} but there are only {pool_size} unique clusters."
        )
        _validate_bounds(
            n_clusters,
            "n_clusters",
            upper=pool_size,
            error_message=too_many_message,
        )

        generator = np.random.default_rng(random_seed)
        drawn = generator.choice(cluster_pool, size=n_clusters, replace=False)

        # Boolean membership mask, converted to 0/1 integers.
        in_drawn_cluster = np.isin(clusters, drawn)
        return in_drawn_cluster.astype(int)

sample

sample(clusters, n_clusters, random_seed=None)

Select entire clusters without replacement.

Draws n_clusters clusters from the unique values of clusters with equal probability and returns selection indicators: every observation whose cluster was drawn receives a 1, all others receive a 0.

Parameters:

Name Type Description Default
clusters NDArray

Array of shape (n_samples,) with cluster identifiers for all observations.

required
n_clusters int

Number of clusters to select. Must be a strictly positive integer and must not exceed the number of unique clusters in clusters.

required
random_seed int or SeedSequence or None

Random seed passed to numpy.random.default_rng for reproducibility. Defaults to None (non-deterministic).

None

Returns:

Type Description
NDArray

Selection indicators of shape (n_samples,): 1 if the observation belongs to a selected cluster, 0 otherwise.

Raises:

Type Description
ValueError
  • If n_clusters is not a strictly positive integer.
  • If n_clusters exceeds the number of unique clusters in clusters.
Source code in glide/samplers/cluster.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def sample(
    self,
    clusters: NDArray,
    n_clusters: int,
    random_seed: Optional[Union[int, SeedSequence]] = None,
) -> NDArray:
    """Draw whole clusters uniformly, without replacement.

    ``n_clusters`` distinct labels are picked with equal probability among the
    unique values of ``clusters``. The result flags each observation with 1 when
    its cluster was picked and with 0 otherwise.

    Parameters
    ----------
    clusters : NDArray
        Array of shape ``(n_samples,)`` giving the cluster identifier of each
        observation.
    n_clusters : int
        How many clusters to draw. Must be a strictly positive integer no larger
        than the number of unique clusters in ``clusters``.
    random_seed : int or SeedSequence or None, optional
        Seed forwarded to ``numpy.random.default_rng`` for reproducibility.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    NDArray
        Array of shape ``(n_samples,)`` of selection indicators: 1 for
        observations in a drawn cluster, 0 for all others.

    Raises
    ------
    ValueError
        - If ``n_clusters`` is not a strictly positive integer.
        - If ``n_clusters`` exceeds the number of unique clusters in ``clusters``.
    """
    _validate_is_integer(n_clusters, "n_clusters")
    _validate_strictly_positive(n_clusters, "n_clusters")

    # The pool to draw from is the set of distinct cluster labels.
    cluster_pool = np.unique(clusters)
    pool_size = len(cluster_pool)
    too_many_message = (
        f"'n_clusters' must not exceed the number of unique clusters; "
        f"got n_clusters={n_clusters} but there are only {pool_size} unique clusters."
    )
    _validate_bounds(
        n_clusters,
        "n_clusters",
        upper=pool_size,
        error_message=too_many_message,
    )

    generator = np.random.default_rng(random_seed)
    drawn = generator.choice(cluster_pool, size=n_clusters, replace=False)

    # Boolean membership mask, converted to 0/1 integers.
    in_drawn_cluster = np.isin(clusters, drawn)
    return in_drawn_cluster.astype(int)