Skip to content

Estimators

glide.estimators.classical.ClassicalMeanEstimator

Estimator for population mean using the classical sample mean.

Uses only a single array y to compute the sample mean and its standard error via the Central Limit Theorem. This serves as a baseline that does not require proxy predictions.

Examples:

>>> import numpy as np
>>> from glide.estimators import ClassicalMeanEstimator
>>> y = np.array([5.0, 6.0, 4.0, 7.0])
>>> estimator = ClassicalMeanEstimator()
>>> result = estimator.estimate(y)
>>> print(result)
Metric: Metric
Point Estimate: 5.500
Confidence Interval (95%): [4.235, 6.765]
Estimator : ClassicalMeanEstimator
n: 4
Source code in glide/estimators/classical.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class ClassicalMeanEstimator:
    """Baseline population-mean estimator built on the plain sample mean.

    Only the observation array ``y`` is required: the point estimate is the
    sample mean and the uncertainty is its standard error, turned into a
    confidence interval via the Central Limit Theorem. No proxy predictions
    are involved, so this estimator serves as a baseline.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ClassicalMeanEstimator
    >>> y = np.array([5.0, 6.0, 4.0, 7.0])
    >>> estimator = ClassicalMeanEstimator()
    >>> result = estimator.estimate(y)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.500
    Confidence Interval (95%): [4.235, 6.765]
    Estimator : ClassicalMeanEstimator
    n: 4
    """

    def _preprocess(self, y: NDArray) -> NDArray:
        """Return ``y`` without its NaN entries.

        The remaining values are passed to ``_validate_min_samples`` before
        being returned.
        """
        observed = y[~np.isnan(y)]
        _validate_min_samples(observed, "y")
        return observed

    def estimate(
        self,
        y: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
    ) -> ClassicalMeanInferenceResult:
        """Compute the sample mean of ``y`` and a CLT confidence interval.

        Parameters
        ----------
        y : NDArray
            Observations, shape ``(n_samples,)``. NaN entries are dropped
            before any computation.
        metric_name : str, optional
            Human-readable label attached to the result. Defaults to
            ``"Metric"``.
        confidence_level : float, optional
            Target coverage of the confidence interval, e.g. ``0.95`` for a
            95 % CI. Defaults to ``0.95``.

        Returns
        -------
        ClassicalMeanInferenceResult
            Holds the CLT-based confidence interval, the metric name, the
            estimator name (``"ClassicalMeanEstimator"``), and ``n``, the
            number of non-NaN observations used.

        Raises
        ------
        ValueError
            If ``y`` contains fewer than 2 non-NaN values.
        """
        observed = self._preprocess(y)
        count = len(observed)
        interval = CLTConfidenceInterval(
            mean=np.mean(observed),
            # Standard error of the mean: sample std (ddof=1) over sqrt(n).
            std=np.std(observed, ddof=1) / np.sqrt(count),
            confidence_level=confidence_level,
        )
        return ClassicalMeanInferenceResult(
            confidence_interval=interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=count,
        )

estimate

estimate(y, metric_name='Metric', confidence_level=0.95)

Estimate the population mean using the classical sample mean.

Parameters:

Name Type Description Default
y NDArray

Array of observations, shape (n_samples,).

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval, e.g. 0.95 for a 95 % CI. Defaults to 0.95.

0.95

Returns:

Type Description
ClassicalMeanInferenceResult

Contains the CLT-based confidence interval, the metric name, the estimator name ("ClassicalMeanEstimator"), and n (number of observations).

Raises:

Type Description
ValueError

If y contains fewer than 2 non-NaN values.

Source code in glide/estimators/classical.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def estimate(
    self,
    y: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
) -> ClassicalMeanInferenceResult:
    """Compute the sample mean of ``y`` and a CLT confidence interval.

    Parameters
    ----------
    y : NDArray
        Observations, shape ``(n_samples,)``.
    metric_name : str, optional
        Human-readable label attached to the result. Defaults to
        ``"Metric"``.
    confidence_level : float, optional
        Target coverage of the confidence interval, e.g. ``0.95`` for a
        95 % CI. Defaults to ``0.95``.

    Returns
    -------
    ClassicalMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name, the
        estimator name (``"ClassicalMeanEstimator"``), and ``n`` (number of
        observations).

    Raises
    ------
    ValueError
        If ``y`` contains fewer than 2 non-NaN values.
    """
    observed = self._preprocess(y)
    count = len(observed)
    interval = CLTConfidenceInterval(
        mean=np.mean(observed),
        # Standard error of the mean: sample std (ddof=1) over sqrt(n).
        std=np.std(observed, ddof=1) / np.sqrt(count),
        confidence_level=confidence_level,
    )
    return ClassicalMeanInferenceResult(
        confidence_interval=interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=count,
    )

glide.estimators.stratified_classical.StratifiedClassicalMeanEstimator

Stratified classical estimator for population mean.

Extends mean estimation as in ClassicalMeanEstimator to datasets partitioned into strata (e.g. by language, domain, or data source). A per-stratum sample mean and standard error are computed independently, then combined with population-proportional weights.

Examples:

>>> import numpy as np
>>> from glide.estimators import StratifiedClassicalMeanEstimator
>>> y = np.array([1.0, 3.0, 5.0, 7.0])
>>> groups = np.array(["A", "A", "B", "B"])
>>> estimator = StratifiedClassicalMeanEstimator()
>>> result = estimator.estimate(y, groups)
>>> print(result)
Metric: Metric
Point Estimate: 4.000
Confidence Interval (95%): [2.614, 5.386]
Estimator : StratifiedClassicalMeanEstimator
n: 4
Source code in glide/estimators/stratified_classical.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class StratifiedClassicalMeanEstimator:
    """Stratified classical estimator for population mean.

    Extends mean estimation as in `ClassicalMeanEstimator` to datasets partitioned
    into strata (e.g. by language, domain, or data source). A per-stratum sample
    mean and standard error are computed independently, then combined with
    population-proportional weights.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import StratifiedClassicalMeanEstimator
    >>> y = np.array([1.0, 3.0, 5.0, 7.0])
    >>> groups = np.array(["A", "A", "B", "B"])
    >>> estimator = StratifiedClassicalMeanEstimator()
    >>> result = estimator.estimate(y, groups)
    >>> print(result)
    Metric: Metric
    Point Estimate: 4.000
    Confidence Interval (95%): [2.614, 5.386]
    Estimator : StratifiedClassicalMeanEstimator
    n: 4
    """

    def estimate(
        self,
        y: NDArray,
        groups: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        stratum_weights: Optional[NDArray] = None,
    ) -> ClassicalMeanInferenceResult:
        """Estimate the population mean using stratified classical inference.

        Splits observations by ``groups``, computes a classical sample-mean
        estimate within each stratum, and combines them with stratum weights:

            theta = sum_k  w_k * theta_k
            sigma2 = sum_k  w_k^2 * sigma2_k

        where ``w_k`` is the weight of stratum *k*. By default ``w_k`` is the
        sample fraction ``n_samples_k / n_samples``; pass ``stratum_weights``
        to use a different weighting.

        It is assumed that ``w_k`` reflects the true weight of stratum *k* for
        all *k*.

        Parameters
        ----------
        y : NDArray
            Array of observations. NaN entries are ignored.
        groups : NDArray
            Array of group identifiers (same length as ``y``). Unique values
            define the strata.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval, e.g. ``0.95``
            for a 95 % CI. Defaults to ``0.95``.
        stratum_weights : NDArray, optional
            Stratum weights in sorted stratum order, one per unique value of
            ``groups``. When provided, these override the sample-count
            proportions. Defaults to ``None`` (infer weights from sample
            counts).

        Returns
        -------
        ClassicalMeanInferenceResult
            Contains the CLT-based confidence interval, the metric name,
            the estimator name (``"StratifiedClassicalMeanEstimator"``), and
            ``n`` (total number of non-NaN samples, as a plain ``int``).

        Raises
        ------
        ValueError
            - If ``y`` and ``groups`` do not have the same length.
            - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If ``stratum_weights`` is given and its length differs from the number of strata.
            - If any stratum contains fewer than 2 non-NaN values.
        """
        # Reject mismatched inputs up front; a length-1 ``groups`` would
        # otherwise broadcast silently against ``y``.
        if len(y) != len(groups):
            raise ValueError(f"y and groups must have the same length; got {len(y)} and {len(groups)}.")
        _validate_has_no_nan(groups, "groups")
        not_nan_mask = ~np.isnan(y)
        # Cast so that ``n`` is a plain Python int rather than a NumPy scalar.
        n_samples = int(np.sum(not_nan_mask))

        # np.unique returns the strata sorted, which is the order
        # ``stratum_weights`` is documented to follow.
        unique_strata = np.unique(groups)
        if stratum_weights is not None and len(stratum_weights) != len(unique_strata):
            # Without this check, missing weights fail with a bare IndexError
            # mid-loop and surplus weights are silently ignored.
            raise ValueError(
                "stratum_weights must contain one weight per stratum; "
                f"got {len(stratum_weights)} weights for {len(unique_strata)} strata."
            )

        weighted_mean = 0.0
        weighted_var = 0.0
        for i, stratum_id in enumerate(unique_strata):
            y_stratum = y[(groups == stratum_id) & not_nan_mask]
            _validate_min_samples(y_stratum, "y", stratum_id)

            n_samples_k = len(y_stratum)
            if stratum_weights is not None:
                w_k = stratum_weights[i]
            else:
                w_k = n_samples_k / n_samples
            mean_k = np.mean(y_stratum)
            # Squared standard error of the stratum mean.
            var_k = np.var(y_stratum, ddof=1) / n_samples_k
            weighted_mean += w_k * mean_k
            weighted_var += w_k**2 * var_k

        std = np.sqrt(weighted_var)
        ci = CLTConfidenceInterval(
            mean=weighted_mean,
            std=std,
            confidence_level=confidence_level,
        )
        result = ClassicalMeanInferenceResult(
            confidence_interval=ci,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=n_samples,
        )
        return result

estimate

estimate(
    y,
    groups,
    metric_name="Metric",
    confidence_level=0.95,
    stratum_weights=None,
)

Estimate the population mean using stratified classical inference.

Splits observations by groups, computes a classical sample-mean estimate within each stratum, and combines them with stratum weights:

theta = sum_k  w_k * theta_k
sigma2 = sum_k  w_k^2 * sigma2_k

where w_k is the weight of stratum k. By default w_k is the sample fraction n_samples_k / n_samples; pass stratum_weights to use a different weighting.

It is assumed that w_k reflects the true weight of stratum k for all k.

Parameters:

Name Type Description Default
y NDArray

Array of observations.

required
groups NDArray

Array of group identifiers (same length as y). Unique values define the strata.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval, e.g. 0.95 for a 95 % CI. Defaults to 0.95.

0.95
stratum_weights NDArray

Stratum weights in sorted stratum order. When provided, these override the sample-count proportions. Defaults to None (infer weights from sample counts).

None

Returns:

Type Description
ClassicalMeanInferenceResult

Contains the CLT-based confidence interval, the metric name, the estimator name ("StratifiedClassicalMeanEstimator"), and n (total number of samples).

Raises:

Type Description
ValueError
  • If groups contains NaN values (numeric dtype) or None values (non-numeric dtype).
  • If any stratum contains fewer than 2 non-NaN values.
Source code in glide/estimators/stratified_classical.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def estimate(
    self,
    y: NDArray,
    groups: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    stratum_weights: Optional[NDArray] = None,
) -> ClassicalMeanInferenceResult:
    """Estimate the population mean using stratified classical inference.

    Splits observations by ``groups``, computes a classical sample-mean
    estimate within each stratum, and combines them with stratum weights:

        theta = sum_k  w_k * theta_k
        sigma2 = sum_k  w_k^2 * sigma2_k

    where ``w_k`` is the weight of stratum *k*. By default ``w_k`` is the
    sample fraction ``n_samples_k / n_samples``; pass ``stratum_weights``
    to use a different weighting.

    It is assumed that ``w_k`` reflects the true weight of stratum *k* for
    all *k*.

    Parameters
    ----------
    y : NDArray
        Array of observations. NaN entries are ignored.
    groups : NDArray
        Array of group identifiers (same length as ``y``). Unique values
        define the strata.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval, e.g. ``0.95``
        for a 95 % CI. Defaults to ``0.95``.
    stratum_weights : NDArray, optional
        Stratum weights in sorted stratum order, one per unique value of
        ``groups``. When provided, these override the sample-count
        proportions. Defaults to ``None`` (infer weights from sample
        counts).

    Returns
    -------
    ClassicalMeanInferenceResult
        Contains the CLT-based confidence interval, the metric name,
        the estimator name (``"StratifiedClassicalMeanEstimator"``), and
        ``n`` (total number of non-NaN samples, as a plain ``int``).

    Raises
    ------
    ValueError
        - If ``y`` and ``groups`` do not have the same length.
        - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If ``stratum_weights`` is given and its length differs from the number of strata.
        - If any stratum contains fewer than 2 non-NaN values.
    """
    # Reject mismatched inputs up front; a length-1 ``groups`` would
    # otherwise broadcast silently against ``y``.
    if len(y) != len(groups):
        raise ValueError(f"y and groups must have the same length; got {len(y)} and {len(groups)}.")
    _validate_has_no_nan(groups, "groups")
    not_nan_mask = ~np.isnan(y)
    # Cast so that ``n`` is a plain Python int rather than a NumPy scalar.
    n_samples = int(np.sum(not_nan_mask))

    # np.unique returns the strata sorted, which is the order
    # ``stratum_weights`` is documented to follow.
    unique_strata = np.unique(groups)
    if stratum_weights is not None and len(stratum_weights) != len(unique_strata):
        # Without this check, missing weights fail with a bare IndexError
        # mid-loop and surplus weights are silently ignored.
        raise ValueError(
            "stratum_weights must contain one weight per stratum; "
            f"got {len(stratum_weights)} weights for {len(unique_strata)} strata."
        )

    weighted_mean = 0.0
    weighted_var = 0.0
    for i, stratum_id in enumerate(unique_strata):
        y_stratum = y[(groups == stratum_id) & not_nan_mask]
        _validate_min_samples(y_stratum, "y", stratum_id)

        n_samples_k = len(y_stratum)
        if stratum_weights is not None:
            w_k = stratum_weights[i]
        else:
            w_k = n_samples_k / n_samples
        mean_k = np.mean(y_stratum)
        # Squared standard error of the stratum mean.
        var_k = np.var(y_stratum, ddof=1) / n_samples_k
        weighted_mean += w_k * mean_k
        weighted_var += w_k**2 * var_k

    std = np.sqrt(weighted_var)
    ci = CLTConfidenceInterval(
        mean=weighted_mean,
        std=std,
        confidence_level=confidence_level,
    )
    result = ClassicalMeanInferenceResult(
        confidence_interval=ci,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=n_samples,
    )
    return result

glide.estimators.ipw_classical.IPWClassicalMeanEstimator

Estimator for population mean using Inverse Probability Weighting (IPW).

Extends the classical sample mean to handle non-uniform sampling. Each observation y_i is reweighted by 1/π_i, where π_i is the pre-determined probability that sample i was selected for labeling. Some values of y_i may be NaN corresponding to unsampled instances.

For the computation to be statistically valid, the sum of π_i should be approximately equal to the number of observed elements y_i.

Examples:

>>> import numpy as np
>>> from glide.estimators import IPWClassicalMeanEstimator
>>> y = np.array([5.0, 6.0, 4.0, np.nan, np.nan, np.nan])
>>> pi = np.array([0.2, 0.8, 0.6, 0.6, 0.4, 0.4])
>>> estimator = IPWClassicalMeanEstimator()
>>> result = estimator.estimate(y, pi)
>>> print(result)
Metric: Metric
Point Estimate: 6.528
Confidence Interval (95%): [-1.230, 14.286]
Estimator : IPWClassicalMeanEstimator
n: 3
Source code in glide/estimators/ipw_classical.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
class IPWClassicalMeanEstimator:
    """Population-mean estimator based on Inverse Probability Weighting (IPW).

    Generalises the classical sample mean to non-uniform sampling: every
    observation y_i is divided by π_i, the pre-determined probability that
    sample i was selected for labeling. Entries of ``y`` may be NaN, which
    marks instances that were not sampled.

    For the computation to be statistically valid, the sum of π_i should be
    approximately equal to the number of observed elements y_i.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import IPWClassicalMeanEstimator
    >>> y = np.array([5.0, 6.0, 4.0, np.nan, np.nan, np.nan])
    >>> pi = np.array([0.2, 0.8, 0.6, 0.6, 0.4, 0.4])
    >>> estimator = IPWClassicalMeanEstimator()
    >>> result = estimator.estimate(y, pi)
    >>> print(result)
    Metric: Metric
    Point Estimate: 6.528
    Confidence Interval (95%): [-1.230, 14.286]
    Estimator : IPWClassicalMeanEstimator
    n: 3
    """

    def _preprocess(self, y: NDArray, sampling_probability: NDArray) -> Tuple[NDArray, NDArray]:
        """Validate the inputs and filter both arrays with the non-zero-π mask.

        Returns ``y`` and ``sampling_probability`` restricted to the entries
        selected by ``_get_non_zero_mask(sampling_probability)``.
        """
        _validate_probabilities(sampling_probability)
        keep = _get_non_zero_mask(sampling_probability)
        _validate_label_prob_consistency(~np.isnan(y), sampling_probability)
        return y[keep], sampling_probability[keep]

    def estimate(
        self,
        y: NDArray,
        sampling_probability: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
    ) -> ClassicalMeanInferenceResult:
        """Compute the IPW-corrected sample mean and its CLT confidence interval.

        Parameters
        ----------
        y : NDArray
            1-D array of observations; NaN marks unobserved entries.
        sampling_probability : NDArray
            1-D array of pre-determined sampling probabilities π_i ∈ [0, 1],
            one per entry of ``y`` (same length). Entries with π_i = 0 are
            left out of the computation.
        metric_name : str, optional
            Human-readable label attached to the result. Defaults to
            ``"Metric"``.
        confidence_level : float, optional
            Target coverage of the confidence interval. Defaults to ``0.95``.

        Returns
        -------
        ClassicalMeanInferenceResult
            Holds the CLT-based confidence interval, the metric name, the
            estimator name (``"IPWClassicalMeanEstimator"``), and ``n``
            (number of labeled observations).

        Raises
        ------
        ValueError
            If any value in ``sampling_probability`` is outside of [0, 1].
            If any labeled observation (non-NaN ``y``) has ``sampling_probability`` equal to 0.
        """
        kept_y, kept_pi = self._preprocess(y, sampling_probability)
        labeled_count = int(np.sum(~np.isnan(kept_y)))
        kept_count = len(kept_y)
        # NaN (unlabeled) entries become 0, so they add nothing to the sum
        # but still count in the mean and in the standard error.
        weighted = np.nan_to_num(kept_y, nan=0) / kept_pi

        interval = CLTConfidenceInterval(
            mean=np.mean(weighted),
            std=np.std(weighted, ddof=1) / np.sqrt(kept_count),
            confidence_level=confidence_level,
        )
        return ClassicalMeanInferenceResult(
            confidence_interval=interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=labeled_count,
        )

estimate

estimate(
    y,
    sampling_probability,
    metric_name="Metric",
    confidence_level=0.95,
)

Estimate the population mean using IPW-corrected sample mean.

Parameters:

Name Type Description Default
y NDArray

1-D array of observations, may contain unobserved NaN values.

required
sampling_probability NDArray

1-D array of pre-determined sampling probabilities π_i ∈ [0, 1], one per observation. Must have the same length as y. Entries with π_i = 0 are excluded from the computation.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval. Defaults to 0.95.

0.95

Returns:

Type Description
ClassicalMeanInferenceResult

Contains the CLT-based confidence interval, the metric name, the estimator name ("IPWClassicalMeanEstimator"), and n (number of labeled observations).

Raises:

Type Description
ValueError

If any value in sampling_probability is outside of [0, 1]. If any labeled observation (non-NaN y) has sampling_probability equal to 0.

Source code in glide/estimators/ipw_classical.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def estimate(
    self,
    y: NDArray,
    sampling_probability: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
) -> ClassicalMeanInferenceResult:
    """Compute the IPW-corrected sample mean and its CLT confidence interval.

    Parameters
    ----------
    y : NDArray
        1-D array of observations; NaN marks unobserved entries.
    sampling_probability : NDArray
        1-D array of pre-determined sampling probabilities π_i ∈ [0, 1],
        one per entry of ``y`` (same length). Entries with π_i = 0 are
        left out of the computation.
    metric_name : str, optional
        Human-readable label attached to the result. Defaults to
        ``"Metric"``.
    confidence_level : float, optional
        Target coverage of the confidence interval. Defaults to ``0.95``.

    Returns
    -------
    ClassicalMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name, the
        estimator name (``"IPWClassicalMeanEstimator"``), and ``n``
        (number of labeled observations).

    Raises
    ------
    ValueError
        If any value in ``sampling_probability`` is outside of [0, 1].
        If any labeled observation (non-NaN ``y``) has ``sampling_probability`` equal to 0.
    """
    kept_y, kept_pi = self._preprocess(y, sampling_probability)
    labeled_count = int(np.sum(~np.isnan(kept_y)))
    kept_count = len(kept_y)
    # NaN (unlabeled) entries become 0, so they add nothing to the sum
    # but still count in the mean and in the standard error.
    weighted = np.nan_to_num(kept_y, nan=0) / kept_pi

    interval = CLTConfidenceInterval(
        mean=np.mean(weighted),
        std=np.std(weighted, ddof=1) / np.sqrt(kept_count),
        confidence_level=confidence_level,
    )
    return ClassicalMeanInferenceResult(
        confidence_interval=interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=labeled_count,
    )

glide.estimators.cluster_classical.ClusterClassicalMeanEstimator

Cluster classical estimator for population mean.

Extends mean estimation as in ClassicalMeanEstimator to datasets where observations are grouped into clusters. Each cluster's size-weighted contribution is treated as the sampling unit, which accounts for within-cluster correlation and produces valid confidence intervals under cluster sampling designs.

Examples:

>>> import numpy as np
>>> from glide.estimators import ClusterClassicalMeanEstimator
>>> y = np.array([5.0, 5.0, 7.0, 7.0])
>>> clusters = np.array(["A", "A", "B", "B"])
>>> estimator = ClusterClassicalMeanEstimator()
>>> result = estimator.estimate(y, clusters)
>>> print(result)
Metric: Metric
Point Estimate: 6.000
Confidence Interval (95%): [4.040, 7.960]
Estimator : ClusterClassicalMeanEstimator
n: 4
Source code in glide/estimators/cluster_classical.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class ClusterClassicalMeanEstimator:
    """Population-mean estimator for cluster-sampled data.

    Builds on the mean estimation of ``ClassicalMeanEstimator`` for datasets
    whose observations are grouped into clusters. The sampling unit is each
    cluster's size-weighted contribution, which accounts for within-cluster
    correlation and produces valid confidence intervals under cluster
    sampling designs.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ClusterClassicalMeanEstimator
    >>> y = np.array([5.0, 5.0, 7.0, 7.0])
    >>> clusters = np.array(["A", "A", "B", "B"])
    >>> estimator = ClusterClassicalMeanEstimator()
    >>> result = estimator.estimate(y, clusters)
    >>> print(result)
    Metric: Metric
    Point Estimate: 6.000
    Confidence Interval (95%): [4.040, 7.960]
    Estimator : ClusterClassicalMeanEstimator
    n: 4
    """

    def _preprocess(
        self,
        y: NDArray,
        clusters: NDArray,
    ) -> Tuple[NDArray, NDArray, int]:
        """Validate the inputs and drop entries whose observation is NaN.

        Returns the non-NaN observations, an integer cluster index for each of
        them (the inverse mapping from ``np.unique``), and the number of
        distinct clusters that still have at least one observation.
        """
        _validate_equal_lengths(y, clusters, names=["y", "clusters"])
        _validate_has_no_nan(clusters, "clusters")

        observed = ~np.isnan(y)
        y_kept = y[observed]
        # Clusters left with no non-NaN observation disappear here, so they
        # are not counted below.
        kept_labels, cluster_indices = np.unique(clusters[observed], return_inverse=True)

        n_valid_clusters = len(kept_labels)
        _validate_bounds(
            n_valid_clusters,
            "n_valid_clusters",
            lower=2,
            error_message=f"Need at least 2 clusters with non-NaN observations; got {n_valid_clusters}.",
        )
        return y_kept, cluster_indices, n_valid_clusters

    def estimate(
        self,
        y: NDArray,
        clusters: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
    ) -> ClassicalMeanInferenceResult:
        """Estimate the population mean from cluster sums.

        The within-cluster sums act as the sampling units for the CLT:

            theta = (1 / N) * sum_l u_l
            sigma2 = L * Var(u_l, ddof=1) / N^2

        with ``u_l = sum_{i in l} y_i`` the cluster sums, ``L`` the number of
        clusters, and ``N = sum_l n_l`` the total number of observations.
        NaN values in ``y`` are removed first, and clusters made up only of
        NaN values are not used.

        Parameters
        ----------
        y : NDArray
            Observations, shape ``(n_samples,)``. NaN values are treated as
            missing and dropped.
        clusters : NDArray
            Cluster identifiers, shape ``(n_samples,)``. Unique values define
            the clusters.
        metric_name : str, optional
            Human-readable label attached to the result. Defaults to
            ``"Metric"``.
        confidence_level : float, optional
            Target coverage of the confidence interval. Defaults to ``0.95``.

        Returns
        -------
        ClassicalMeanInferenceResult
            Holds the CLT-based confidence interval, the metric name, the
            estimator name (``"ClusterClassicalMeanEstimator"``), and ``n``
            (total number of non-NaN observations across all clusters).

        Raises
        ------
        ValueError
            - If ``y`` and ``clusters`` do not have the same length.
            - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If fewer than 2 clusters have at least one non-NaN observation.
        """
        y_kept, cluster_indices, n_clusters = self._preprocess(y, clusters)
        n_obs = len(y_kept)

        # One entry per cluster: the sum of its non-NaN observations.
        sums_per_cluster = np.bincount(cluster_indices, weights=y_kept)

        point_estimate = np.sum(sums_per_cluster) / n_obs
        variance = n_clusters * np.var(sums_per_cluster, ddof=1) / n_obs**2

        interval = CLTConfidenceInterval(
            mean=point_estimate,
            std=np.sqrt(variance),
            confidence_level=confidence_level,
        )
        return ClassicalMeanInferenceResult(
            confidence_interval=interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=n_obs,
        )

estimate

estimate(
    y, clusters, metric_name="Metric", confidence_level=0.95
)

Estimate the population mean using the cluster classical estimator.

Computes within-cluster sums and uses them as sampling units to apply the CLT:

theta = (1 / N) * sum_l u_l
sigma2 = L * Var(u_l, ddof=1) / N^2

where u_l = sum_{i in l} y_i are the cluster sums, L is the number of clusters, and N = sum_l n_l is the total number of observations. NaN values in y are dropped before making the computations. Clusters that contain only NaN are not used.

Parameters:

Name Type Description Default
y NDArray

Array of observations, shape (n_samples,). NaN values are treated as missing and dropped.

required
clusters NDArray

Array of cluster identifiers, shape (n_samples,). Unique values define the clusters.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval. Defaults to 0.95.

0.95

Returns:

Type Description
ClassicalMeanInferenceResult

Contains the CLT-based confidence interval, the metric name, the estimator name ("ClusterClassicalMeanEstimator"), and n (total number of non-NaN observations across all clusters).

Raises:

Type Description
ValueError
  • If y and clusters do not have the same length.
  • If clusters contains NaN values (numeric dtype) or None values (non-numeric dtype).
  • If fewer than 2 clusters have at least one non-NaN observation.
Source code in glide/estimators/cluster_classical.py
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def estimate(
    self,
    y: NDArray,
    clusters: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
) -> ClassicalMeanInferenceResult:
    """Compute a CLT-based estimate of the population mean from clustered data.

    Whole clusters, rather than individual observations, are treated as the
    sampling units. With ``u_l = sum_{i in l} y_i`` the sum of the
    observations in cluster ``l``, ``L`` the number of clusters and
    ``N = sum_l n_l`` the total number of observations, the estimator is:

        theta = (1 / N) * sum_l u_l
        sigma2 = L * Var(u_l, ddof=1) / N^2

    NaN entries of ``y`` are discarded before any computation, and clusters
    left with no observation after that step are ignored.

    Parameters
    ----------
    y : NDArray
        Observations, shape ``(n_samples,)``. NaN marks a missing value and
        is dropped.
    clusters : NDArray
        Cluster identifier of each observation, shape ``(n_samples,)``.
        Each distinct value defines one cluster.
    metric_name : str, optional
        Human-readable label attached to the result. Defaults to
        ``"Metric"``.
    confidence_level : float, optional
        Target coverage of the confidence interval. Defaults to ``0.95``.

    Returns
    -------
    ClassicalMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name, the
        estimator name (``"ClusterClassicalMeanEstimator"``), and ``n``,
        the total number of non-NaN observations across all clusters.

    Raises
    ------
    ValueError
        - If ``y`` and ``clusters`` do not have the same length.
        - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If fewer than 2 clusters have at least one non-NaN observation.
    """
    observations, cluster_ids, n_clusters = self._preprocess(y, clusters)
    n_obs = len(observations)

    # bincount with weights accumulates the observations that share a
    # cluster index, which yields the cluster sums u_l.
    per_cluster_totals = np.bincount(cluster_ids, weights=observations)

    point_estimate = np.sum(per_cluster_totals) / n_obs
    variance = n_clusters * np.var(per_cluster_totals, ddof=1) / n_obs**2

    return ClassicalMeanInferenceResult(
        confidence_interval=CLTConfidenceInterval(
            mean=point_estimate,
            std=np.sqrt(variance),
            confidence_level=confidence_level,
        ),
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=n_obs,
    )

glide.estimators.ppi.PPIMeanEstimator

Estimator for population mean using Prediction-Powered Inference (PPI).

This class implements the PPI method which combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy model. The method provides consistent estimates even when the proxy is imperfect. An optional power-tuning mode (enabled by default) applies the optimal weight λ from PPI++, ensuring the confidence interval is never wider than the one obtained without the proxy.

References

Angelopoulos, Anastasios N., Stephen Bates, Clara Fannjiang, Michael I. Jordan, and Tijana Zrnic. "Prediction-powered inference." Science 382, no. 6671 (2023): 669-674.

Angelopoulos, Anastasios N., John C. Duchi, and Tijana Zrnic. "PPI++: Efficient prediction-powered inference." arXiv preprint arXiv:2311.01453 (2023).

Examples:

>>> import numpy as np
>>> from glide.estimators import PPIMeanEstimator
>>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
>>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
>>> estimator = PPIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy)
>>> print(result)
Metric: Metric
Point Estimate: 5.618
Confidence Interval (95%): [4.923, 6.312]
Estimator : PPIMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 3
Source code in glide/estimators/ppi.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
class PPIMeanEstimator:
    """Population-mean estimator based on Prediction-Powered Inference (PPI).

    PPI pairs a small labeled sample with a large unlabeled sample whose labels
    are approximated by a proxy model, and remains consistent even when that proxy
    is imperfect. Power tuning (on by default) applies the optimal weight λ from
    PPI++, ensuring the confidence interval is never wider than the one obtained
    without the proxy.

    References
    ----------
    Angelopoulos, Anastasios N., Stephen Bates, Clara Fannjiang, Michael I. Jordan, and Tijana
    Zrnic. "Prediction-powered inference." Science 382, no. 6671 (2023): 669-674.

    Angelopoulos, Anastasios N., John C. Duchi, and Tijana Zrnic. "PPI++: Efficient
    prediction-powered inference." arXiv preprint arXiv:2311.01453 (2023).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import PPIMeanEstimator
    >>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
    >>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
    >>> estimator = PPIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.618
    Confidence Interval (95%): [4.923, 6.312]
    Estimator : PPIMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 3
    """

    def _preprocess(self, y_true_all: NDArray, y_proxy_all: NDArray) -> Tuple[NDArray, NDArray, NDArray]:
        """Validate the inputs and split them into labeled and unlabeled parts.

        A row counts as labeled when its ``y_true_all`` entry is not NaN.

        Returns
        -------
        tuple of NDArray
            ``(y_true, y_proxy_labeled, y_proxy_unlabeled)``: the non-NaN
            labels, the proxy values of the labeled rows, and the proxy
            values of the unlabeled rows.
        """
        _validate_equal_lengths(y_true_all, y_proxy_all, names=["y_true", "y_proxy"])
        _validate_y_proxy(y_proxy_all)
        _validate_y_true(y_true_all)

        is_unlabeled = np.isnan(y_true_all)
        is_labeled = ~is_unlabeled
        _validate_sample_sizes(is_labeled)

        return (
            y_true_all[is_labeled],
            y_proxy_all[is_labeled],
            y_proxy_all[is_unlabeled],
        )

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean with Prediction-Powered Inference (PPI).

        A small labeled sample is combined with a large unlabeled sample whose labels
        are approximated by a proxy (e.g. a pretrained model). The rectifier
        ``mean(y_true) - λ·mean(y_proxy_labeled)`` removes the proxy's bias, so the
        estimate stays consistent even when the proxy is imperfect.

        The weight λ interpolates between using ``y_true`` alone (λ = 0) and the
        standard PPI estimate that weights ``y_true`` and ``y_proxy`` equally (λ = 1).
        With ``power_tuning=True`` (default), λ is obtained from the PPI++ closed-form
        formula so as to minimise the confidence interval width. With
        ``power_tuning=False``, λ = 1 and the estimator is the classic PPI estimator.

        Parameters
        ----------
        y_true : NDArray
            Array of labeled observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
        y_proxy : NDArray
            Array of proxy predictions, shape ``(n_samples,)``.
            Must be fully populated (no NaN). Must have nonzero variance.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval, e.g. ``0.95``
            for a 95 % CI. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), compute the optimal λ via the PPI++ formula
            to minimise CI width. If ``False``, use λ = 1 (classic PPI).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds the CLT-based confidence interval, the metric name,
            the estimator name (``"PPIMeanEstimator"``), the counts
            ``n_true`` (labeled observations) and ``n_proxy`` (all observations
            with a proxy prediction), and the effective sample size.

        Raises
        ------
        ValueError
            - If ``y_true`` and ``y_proxy`` have different lengths.
            - If any proxy value is NaN.
            - If all proxy values are identical.
            - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        samples = self._preprocess(y_true, y_proxy)
        labels, _, proxy_unlabeled = samples
        n_labeled = len(labels)
        n_unlabeled = len(proxy_unlabeled)

        lambda_ = _compute_tuning_parameter(*samples, power_tuning)
        ppi_interval = CLTConfidenceInterval(
            mean=_compute_mean_estimate(*samples, lambda_),
            std=_compute_std_estimate(*samples, lambda_),
            confidence_level=confidence_level,
        )

        # Labels-only baseline: the ratio of its variance to the PPI variance
        # scales n_labeled into the effective sample size.
        baseline_interval = ClassicalMeanEstimator().estimate(labels).confidence_interval
        effective_sample_size = floor(n_labeled * baseline_interval.var / ppi_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=ppi_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            n_proxy=n_labeled + n_unlabeled,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using Prediction-Powered Inference (PPI).

Combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy (e.g. a pretrained model). The rectifier mean(y_true) - λ·mean(y_proxy_labeled) corrects the bias of the proxy, yielding a consistent estimate even when the proxy is imperfect.

The weight λ interpolates between relying only on y_true (λ = 0) and the standard PPI estimate that leverages both y_true and y_proxy with equal weights (λ = 1). When power_tuning=True (default), the optimal λ is computed via the PPI++ closed-form formula to minimise the confidence interval width. When power_tuning=False, λ = 1 and the estimator reduces to the classic PPI estimator.

Parameters:

Name Type Description Default
y_true NDArray

Array of labeled observations, shape (n_samples,). Labeled entries are finite; unlabeled entries are np.nan.

required
y_proxy NDArray

Array of proxy predictions, shape (n_samples,). Must be fully populated (no NaN). Must have nonzero variance.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval, e.g. 0.95 for a 95 % CI. Defaults to 0.95.

0.95
power_tuning bool

If True (default), compute the optimal λ via the PPI++ formula to minimise CI width. If False, use λ = 1 (classic PPI).

True

Returns:

Type Description
PredictionPoweredMeanInferenceResult

Contains the CLT-based confidence interval, the metric name, the estimator name ("PPIMeanEstimator"), and the counts n_true (labeled observations) and n_proxy (all observations with a proxy prediction).

Raises:

Type Description
ValueError
  • If y_true and y_proxy have different lengths.
  • If any proxy value is NaN.
  • If all proxy values are identical.
  • If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
Source code in glide/estimators/ppi.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean with Prediction-Powered Inference (PPI).

    A small labeled sample is combined with a large unlabeled sample whose labels
    are approximated by a proxy (e.g. a pretrained model). The rectifier
    ``mean(y_true) - λ·mean(y_proxy_labeled)`` removes the proxy's bias, so the
    estimate stays consistent even when the proxy is imperfect.

    The weight λ interpolates between using ``y_true`` alone (λ = 0) and the
    standard PPI estimate that weights ``y_true`` and ``y_proxy`` equally (λ = 1).
    With ``power_tuning=True`` (default), λ is obtained from the PPI++ closed-form
    formula so as to minimise the confidence interval width. With
    ``power_tuning=False``, λ = 1 and the estimator is the classic PPI estimator.

    Parameters
    ----------
    y_true : NDArray
        Array of labeled observations, shape ``(n_samples,)``.
        Labeled entries are finite; unlabeled entries are ``np.nan``.
    y_proxy : NDArray
        Array of proxy predictions, shape ``(n_samples,)``.
        Must be fully populated (no NaN). Must have nonzero variance.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval, e.g. ``0.95``
        for a 95 % CI. Defaults to ``0.95``.
    power_tuning : bool, optional
        If ``True`` (default), compute the optimal λ via the PPI++ formula
        to minimise CI width. If ``False``, use λ = 1 (classic PPI).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name,
        the estimator name (``"PPIMeanEstimator"``), the counts
        ``n_true`` (labeled observations) and ``n_proxy`` (all observations
        with a proxy prediction), and the effective sample size.

    Raises
    ------
    ValueError
        - If ``y_true`` and ``y_proxy`` have different lengths.
        - If any proxy value is NaN.
        - If all proxy values are identical.
        - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    samples = self._preprocess(y_true, y_proxy)
    labels, _, proxy_unlabeled = samples
    n_labeled = len(labels)
    n_unlabeled = len(proxy_unlabeled)

    lambda_ = _compute_tuning_parameter(*samples, power_tuning)
    ppi_interval = CLTConfidenceInterval(
        mean=_compute_mean_estimate(*samples, lambda_),
        std=_compute_std_estimate(*samples, lambda_),
        confidence_level=confidence_level,
    )

    # Labels-only baseline: the ratio of its variance to the PPI variance
    # scales n_labeled into the effective sample size.
    baseline_interval = ClassicalMeanEstimator().estimate(labels).confidence_interval
    effective_sample_size = floor(n_labeled * baseline_interval.var / ppi_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=ppi_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        n_proxy=n_labeled + n_unlabeled,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.stratified_ppi.StratifiedPPIMeanEstimator

Stratified PPI++ estimator for population mean.

Extends Prediction-Powered Inference to datasets that are naturally partitioned into strata (e.g. by language, domain, or data source). A per-stratum power-tuned lambda is computed independently for each stratum, and the final estimate is a population-proportional weighted average of the per-stratum PPI++ estimates.

This yields narrower confidence intervals than standard PPI++ whenever strata differ in proxy quality or relative size, because the optimal lambda can adapt to each stratum's signal-to-noise ratio.

References

Fisch, Adam, Joshua Maynez, R. Alex Hofer, Bhuwan Dhingra, Amir Globerson, and William W. Cohen. "Stratified prediction-powered inference for effective hybrid evaluation of language models." Advances in Neural Information Processing Systems 37 (2024): 111489-111514.

Fogliato, Riccardo, Pratik Patil, Mathew Monfort, and Pietro Perona. "A framework for efficient model evaluation through stratification, sampling, and estimation." In European Conference on Computer Vision, pp. 140-158. Cham: Springer Nature Switzerland, 2024.

Examples:

>>> import numpy as np
>>> from glide.estimators import StratifiedPPIMeanEstimator
>>> y_true = np.array([1.0, 2.0, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan])
>>> y_proxy = np.array([1.1, 2.2, 1.5, 1.8, 3.9, 5.1, 4.5, 4.8])
>>> groups = np.array([0, 0, 0, 0, 1, 1, 1, 1])
>>> estimator = StratifiedPPIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, groups)
>>> print(result)
Metric: Metric
Point Estimate: 3.086
Confidence Interval (95%): [2.720, 3.452]
Estimator : StratifiedPPIMeanEstimator
n_true: 4
n_proxy: 8
Effective Sample Size: 14
Source code in glide/estimators/stratified_ppi.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
class StratifiedPPIMeanEstimator:
    """Stratified PPI++ estimator for population mean.

    Applies Prediction-Powered Inference to datasets that split naturally into
    strata (e.g. by language, domain, or data source). Each stratum gets its own
    power-tuned lambda, computed independently, and the final estimate is a
    population-proportional weighted average of the per-stratum PPI++ estimates.

    This yields narrower confidence intervals than standard PPI++ whenever strata differ
    in proxy quality or relative size, because the optimal lambda can adapt to each
    stratum's signal-to-noise ratio.

    References
    ----------
    Fisch, Adam, Joshua Maynez, R. Alex Hofer, Bhuwan Dhingra, Amir Globerson, and
    William W. Cohen. "Stratified prediction-powered inference for effective hybrid
    evaluation of language models." Advances in Neural Information Processing
    Systems 37 (2024): 111489-111514.

    Fogliato, Riccardo, Pratik Patil, Mathew Monfort, and Pietro Perona. "A framework
    for efficient model evaluation through stratification, sampling, and estimation."
    In European Conference on Computer Vision, pp. 140-158. Cham: Springer Nature
    Switzerland, 2024.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import StratifiedPPIMeanEstimator
    >>> y_true = np.array([1.0, 2.0, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan])
    >>> y_proxy = np.array([1.1, 2.2, 1.5, 1.8, 3.9, 5.1, 4.5, 4.8])
    >>> groups = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    >>> estimator = StratifiedPPIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, groups)
    >>> print(result)
    Metric: Metric
    Point Estimate: 3.086
    Confidence Interval (95%): [2.720, 3.452]
    Estimator : StratifiedPPIMeanEstimator
    n_true: 4
    n_proxy: 8
    Effective Sample Size: 14
    """

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        groups: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using Stratified PPI++.

        The arrays are split by the unique values of ``groups``, a PPI++
        estimate is computed inside each stratum, and the per-stratum results
        are combined with population-proportional weights:

            theta = sum_k  w_k * theta_k(lambda_k)
            sigma2 = sum_k  w_k^2 * sigma2_k(lambda_k)

        where ``w_k`` is the fraction of samples in stratum *k*.

        Statistical validity relies on the proportions of labeled versus
        unlabeled samples being approximately the same in all strata.

        A sample is labeled when its ``y_true`` entry is not ``NaN`` and
        unlabeled otherwise.

        Parameters
        ----------
        y_true : NDArray
            Array of observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
        y_proxy : NDArray
            Array of proxy predictions, shape ``(n_samples,)``.
            Must be fully populated (no NaN). Must have nonzero variance.
        groups : NDArray
            Array of integer stratum identifiers, shape ``(n_samples,)``. Unique
            values define the strata.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), compute the optimal ``lambda_k`` per stratum
            via the PPI++ formula. If ``False``, use ``lambda_k = 1.0`` for all
            strata (classic PPI).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds the CLT-based confidence interval, the metric name,
            the estimator name (``"StratifiedPPIMeanEstimator"``), the counts
            ``n_true`` (total labeled rows) and ``n_proxy`` (total dataset size),
            and the effective sample size.

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
            - If any proxy value is NaN.
            - If all proxy values within a stratum are identical.
            - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        strata = _preprocess(y_true, y_proxy, groups)
        n_samples = len(y_true)

        # Running totals of the weighted per-stratum means and variances.
        mean_acc = 0.0
        var_acc = 0.0
        for labels, proxy_labeled, proxy_unlabeled in strata:
            stratum_arrays = (labels, proxy_labeled, proxy_unlabeled)
            # Stratum weight: its share (labeled + unlabeled rows) of the dataset.
            weight = (len(labels) + len(proxy_unlabeled)) / n_samples

            lam = _compute_tuning_parameter(*stratum_arrays, power_tuning)
            stratum_mean = _compute_mean_estimate(*stratum_arrays, lam)
            stratum_std = _compute_std_estimate(*stratum_arrays, lam)

            mean_acc += weight * stratum_mean
            var_acc += weight**2 * stratum_std**2

        combined_std = np.sqrt(var_acc)
        n_true = int(np.sum(~np.isnan(y_true)))

        ppi_interval = CLTConfidenceInterval(
            mean=mean_acc,
            std=combined_std,
            confidence_level=confidence_level,
        )

        # Labels-only stratified baseline with the same stratum weights; the
        # ratio of its variance to the PPI variance scales n_true into the
        # effective sample size.
        stratum_counts = np.unique(groups, return_counts=True)[1]
        baseline_result = StratifiedClassicalMeanEstimator().estimate(
            y_true,
            groups,
            stratum_weights=stratum_counts / n_samples,
        )
        baseline_interval = baseline_result.confidence_interval
        effective_sample_size = floor(n_true * baseline_interval.var / ppi_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=ppi_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_true,
            n_proxy=n_samples,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    groups,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using Stratified PPI++.

Splits arrays by unique values in groups, computes a power-tuned PPI++ estimate within each stratum, and combines them with population-proportional weights:

theta = sum_k  w_k * theta_k(lambda_k)
sigma2 = sum_k  w_k^2 * sigma2_k(lambda_k)

where w_k is the fraction of samples in stratum k.

Note that this assumes the proportions of labeled vs. unlabeled samples are approximately the same in all strata, which is important for statistical validity.

Labeled and unlabeled samples are distinguished by NaN in y_true: a sample is labeled if its y_true entry is not NaN.

Parameters:

Name Type Description Default
y_true NDArray

Array of observations, shape (n_samples,). Labeled entries are finite; unlabeled entries are np.nan.

required
y_proxy NDArray

Array of proxy predictions, shape (n_samples,). Must be fully populated (no NaN). Must have nonzero variance.

required
groups NDArray

Array of integer stratum identifiers, shape (n_samples,). Unique values define the strata.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval. Defaults to 0.95.

0.95
power_tuning bool

If True (default), compute the optimal lambda_k per stratum via the PPI++ formula. If False, use lambda_k = 1.0 for all strata (classic PPI).

True

Returns:

Type Description
PredictionPoweredMeanInferenceResult

Contains the CLT-based confidence interval, the metric name, the estimator name ("StratifiedPPIMeanEstimator"), and the counts n_true (total labeled rows) and n_proxy (total dataset size).

Raises:

Type Description
ValueError
  • If y_true, y_proxy, and groups do not all have the same length.
  • If any proxy value is NaN.
  • If all proxy values within a stratum are identical.
  • If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
Source code in glide/estimators/stratified_ppi.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    groups: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean with stratified PPI++.

    The samples are partitioned by the unique values of ``groups``. A PPI++
    estimate is computed independently inside every stratum, and the
    per-stratum results are merged using population-proportional weights:

        theta = sum_k  w_k * theta_k(lambda_k)
        sigma2 = sum_k  w_k^2 * sigma2_k(lambda_k)

    with ``w_k`` the share of all samples that belong to stratum *k*.

    Statistical validity relies on the proportion of labeled versus
    unlabeled samples being approximately the same in every stratum.

    A sample counts as labeled when its ``y_true`` entry is not ``NaN``;
    ``NaN`` entries mark unlabeled samples.

    Parameters
    ----------
    y_true : NDArray
        Observations, shape ``(n_samples,)``. Finite where a label exists,
        ``np.nan`` where it does not.
    y_proxy : NDArray
        Proxy predictions, shape ``(n_samples,)``. Every entry must be
        populated (no NaN) and the values must have nonzero variance.
    groups : NDArray
        Integer stratum identifiers, shape ``(n_samples,)``. Each unique
        value defines one stratum.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    power_tuning : bool, optional
        When ``True`` (default), the optimal ``lambda_k`` of each stratum is
        computed via the PPI++ formula. When ``False``, ``lambda_k = 1.0``
        is used for every stratum (classic PPI).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name, the
        estimator name (``"StratifiedPPIMeanEstimator"``), the counts
        ``n_true`` (total labeled rows) and ``n_proxy`` (total dataset size),
        and ``effective_sample_size``, computed as
        ``floor(n_true * var_classical / var_estimate)`` against a stratified
        classical baseline.

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
        - If any proxy value is NaN.
        - If all proxy values within a stratum are identical.
        - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    # Validation and per-stratum splitting happen first, as in any later step
    # the inputs are assumed to be well-formed.
    per_stratum = _preprocess(y_true, y_proxy, groups)
    total_count = len(y_true)

    combined_mean = 0.0
    combined_var = 0.0
    for stratum in per_stratum:
        labels_k, proxy_labeled_k, proxy_unlabeled_k = stratum

        # Population-proportional weight of this stratum.
        weight_k = (len(labels_k) + len(proxy_unlabeled_k)) / total_count

        lambda_k = _compute_tuning_parameter(labels_k, proxy_labeled_k, proxy_unlabeled_k, power_tuning)
        mean_k = _compute_mean_estimate(labels_k, proxy_labeled_k, proxy_unlabeled_k, lambda_k)
        std_k = _compute_std_estimate(labels_k, proxy_labeled_k, proxy_unlabeled_k, lambda_k)

        combined_mean += weight_k * mean_k
        combined_var += weight_k**2 * std_k**2

    labeled_count = int(np.sum(~np.isnan(y_true)))

    confidence_interval = CLTConfidenceInterval(
        mean=combined_mean,
        std=np.sqrt(combined_var),
        confidence_level=confidence_level,
    )

    # Baseline: stratified classical estimator on the labels alone, using the
    # same population-proportional stratum weights.
    _, counts_per_stratum = np.unique(groups, return_counts=True)
    baseline_result = StratifiedClassicalMeanEstimator().estimate(
        y_true,
        groups,
        stratum_weights=counts_per_stratum / total_count,
    )
    baseline_interval = baseline_result.confidence_interval
    effective_sample_size = floor(labeled_count * baseline_interval.var / confidence_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=labeled_count,
        n_proxy=total_count,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.asi.ASIMeanEstimator

Estimator for population mean using Active Statistical Inference (ASI).

This class implements the ASI method which extends PPI++ to non-uniform sampling. Each labeled sample has a known, pre-determined sampling probability π_i. Inverse probability weighting (IPW) corrects for this non-uniform selection, yielding valid confidence intervals under any sampling rule.

The special case where all π_i are equal to n_labeled / n recovers PPI++ at λ = 1.

References

Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.

Gligorić, Kristina, Tijana Zrnic, Cinoo Lee, Emmanuel Candes, and Dan Jurafsky. "Can unconfident llm annotations be used for confident conclusions?." In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 3514-3533. 2025.

Examples:

>>> import numpy as np
>>> from glide.estimators import ASIMeanEstimator
>>> y_true = np.array([0.0, 1.0, np.nan, np.nan])
>>> y_proxy = np.array([0.1, 0.9, 0.5, 0.5])
>>> pi = np.array([0.8, 0.8, 0.8, 0.8])
>>> estimator = ASIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, pi)
>>> print(result)
Metric: Metric
Point Estimate: 0.548
Confidence Interval (95%): [0.138, 0.958]
Estimator : ASIMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 4
Source code in glide/estimators/asi.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
class ASIMeanEstimator:
    """Estimator for population mean using Active Statistical Inference (ASI).

    ASI extends PPI++ to non-uniform sampling: every sample carries a known,
    pre-determined probability π_i of being labeled. Inverse probability
    weighting (IPW) compensates for that non-uniform selection, so the resulting
    confidence intervals remain valid under any sampling rule.

    The special case where all π_i are equal to n_labeled / n recovers PPI++ at λ = 1.

    References
    ----------
    Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings
    of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.

    Gligorić, Kristina, Tijana Zrnic, Cinoo Lee, Emmanuel Candes, and Dan Jurafsky.
    "Can unconfident llm annotations be used for confident conclusions?." In Proceedings
    of the 2025 Conference of the Nations of the Americas Chapter of the Association for
    Computational Linguistics: Human Language Technologies (Volume 1: Long Papers),
    pp. 3514-3533. 2025.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ASIMeanEstimator
    >>> y_true = np.array([0.0, 1.0, np.nan, np.nan])
    >>> y_proxy = np.array([0.1, 0.9, 0.5, 0.5])
    >>> pi = np.array([0.8, 0.8, 0.8, 0.8])
    >>> estimator = ASIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, pi)
    >>> print(result)
    Metric: Metric
    Point Estimate: 0.548
    Confidence Interval (95%): [0.138, 0.958]
    Estimator : ASIMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 4
    """

    def _preprocess(
        self,
        y_true_all: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
    ) -> Tuple[NDArray, NDArray, NDArray, NDArray]:
        """Validate the inputs and reduce them to the rows used for estimation.

        Returns, in order: the labels with NaN replaced by 0, the proxy
        predictions, the labeling indicator ξ (1.0 where a label exists, else
        0.0) and the sampling probabilities, all restricted to the rows
        selected by ``_get_non_zero_mask(pi)``.
        """
        _validate_equal_lengths(y_true_all, y_proxy, pi, names=["y_true", "y_proxy", "pi"])
        _validate_y_proxy(y_proxy)
        _validate_probabilities(pi)

        is_labeled = ~np.isnan(y_true_all)
        _validate_label_prob_consistency(is_labeled, pi)
        indicator = is_labeled.astype(float)

        keep = _get_non_zero_mask(pi)
        labels_kept = y_true_all[keep]
        proxy_kept = y_proxy[keep]
        pi_kept = pi[keep]
        indicator_kept = indicator[keep]

        # The tuning parameter divides by the variance of these rectifiers, so
        # a constant rectifier vector is rejected up front.
        rectifiers = proxy_kept * (indicator_kept / pi_kept - 1)
        _validate_non_constant(rectifiers, "'y_proxy' values lead to constant rectifiers.")

        # Unlabeled rows are multiplied by ξ = 0 downstream; zero-filling keeps
        # NaN from propagating through those products.
        return np.nan_to_num(labels_kept, nan=0), proxy_kept, indicator_kept, pi_kept

    def _compute_tuning_parameter(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        xi: NDArray,
        pi: NDArray,
        power_tuning: bool,
    ) -> float:
        """Return λ: ``1.0`` without tuning, otherwise ``cov(a, b) / var(a)``.

        Here ``a = y_proxy * (xi / pi - 1)`` and ``b = y_true * xi / pi``, with
        sample (``ddof=1``) moments.
        """
        if power_tuning:
            rectifier = y_proxy * (xi / pi - 1)
            ipw_labels = y_true * xi / pi
            # First row of the 2x2 covariance matrix: [var(a), cov(a, b)].
            rectifier_var, cross_cov = np.cov(rectifier, ipw_labels, ddof=1)[0]
            return cross_cov / rectifier_var
        return 1.0

    def _compute_rectified_labels(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        xi: NDArray,
        pi: NDArray,
        _lambda: float,
    ) -> NDArray:
        """Return ``λ·y_proxy + ξ·(y_true − λ·y_proxy) / π`` element-wise."""
        scaled_proxy = _lambda * y_proxy
        ipw_correction = xi * (y_true - scaled_proxy) / pi
        return scaled_proxy + ipw_correction

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using Active Statistical Inference (ASI).

        Labeled and unlabeled samples are merged into one estimator in which
        inverse-probability weighting (IPW) corrects for non-uniform sampling.
        The estimate is the mean of the rectified labels
        ``λ·y_proxy + ξ·(y_true − λ·y_proxy) / π`` and its standard error is their
        sample standard deviation divided by the square root of their count.
        By default λ is power-tuned to minimise asymptotic variance.

        Parameters
        ----------
        y_true : NDArray
            Array of shape ``(n_samples,)`` with ground-truth labels. ``np.nan``
            marks an unlabeled sample (ξ_i = 0); any non-NaN entry is treated as
            labeled (ξ_i = 1).
        y_proxy : NDArray
            Array of shape ``(n_samples,)`` with proxy predictions. Required for
            every sample; must not contain NaN.
        pi : NDArray
            Array of shape ``(n_samples,)`` with the pre-determined sampling
            probability π_i ∈ [0, 1] of each sample. Entries with π_i = 0 are
            excluded from all computations.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), λ is chosen analytically to minimise asymptotic
            variance. If ``False``, λ = 1 is used (the untuned estimator).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Contains a ``CLTConfidenceInterval``, metric name, estimator
            name (``"ASIMeanEstimator"``), the counts ``n_true`` (labeled samples
            kept after preprocessing) and ``n_proxy`` (samples kept after
            preprocessing), and ``effective_sample_size``, computed as
            ``floor(n_true * var_classical / var_estimate)`` against an
            ``IPWClassicalMeanEstimator`` baseline.

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
            - If any proxy value is NaN.
            - If the rectifiers ``y_proxy * (ξ_i / π_i - 1)`` are constant.
            - If any value in ``pi`` is not in [0, 1].
        """
        labels, proxy, xi, probs = self._preprocess(y_true, y_proxy, pi)

        n_true = int(xi.sum())
        n_proxy = len(probs)

        _lambda = self._compute_tuning_parameter(labels, proxy, xi, probs, power_tuning)
        rectified = self._compute_rectified_labels(labels, proxy, xi, probs, _lambda)

        confidence_interval = CLTConfidenceInterval(
            mean=np.mean(rectified),
            std=np.std(rectified, ddof=1) / np.sqrt(n_proxy),
            confidence_level=confidence_level,
        )

        # Baseline is fitted on the raw (unfiltered) labels and probabilities.
        baseline_result = IPWClassicalMeanEstimator().estimate(y_true, pi)
        baseline_interval = baseline_result.confidence_interval
        effective_sample_size = floor(n_true * baseline_interval.var / confidence_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_true,
            n_proxy=n_proxy,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    pi,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using Active Statistical Inference (ASI).

Uses inverse-probability weighting (IPW) to correct for non-uniform sampling, combining labeled and unlabeled samples into a single IPW-corrected estimator. A power-tuning step (enabled by default) finds the λ that minimises asymptotic variance.

Parameters:

Name Type Description Default
y_true NDArray

Array of shape (n_samples,) with ground-truth labels. Use np.nan for unlabeled samples (ξ_i = 0); non-NaN entries are treated as labeled (ξ_i = 1).

required
y_proxy NDArray

Array of shape (n_samples,) with proxy predictions. Must be present for every sample and must not contain NaN.

required
pi NDArray

Array of shape (n_samples,) with the pre-determined sampling probability π_i ∈ [0, 1] for each sample. Entries with π_i = 0 are excluded from all computations.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval. Defaults to 0.95.

0.95
power_tuning bool

If True (default), selects λ analytically to minimise asymptotic variance. If False, uses λ = 1 (the untuned ASI estimator; the plain IPW estimator would correspond to λ = 0).

True

Returns:

Type Description
PredictionPoweredMeanInferenceResult

Contains a CLTConfidenceInterval, metric name, estimator name ("ASIMeanEstimator"), and counts n_true (labeled samples) and n_proxy (total samples).

Raises:

Type Description
ValueError
  • If y_true, y_proxy, and pi do not all have the same length.
  • If any proxy value is NaN.
  • If the rectifiers y_proxy * (ξ_i / π_i - 1) are constant.
  • If any value in pi is not in [0, 1].
Source code in glide/estimators/asi.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    pi: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean using Active Statistical Inference (ASI).

    Labeled and unlabeled samples are merged into one estimator in which
    inverse-probability weighting (IPW) corrects for non-uniform sampling.
    The estimate is the mean of the rectified labels and its standard error is
    their sample standard deviation divided by the square root of their count.
    By default λ is power-tuned to minimise asymptotic variance.

    Parameters
    ----------
    y_true : NDArray
        Array of shape ``(n_samples,)`` with ground-truth labels. ``np.nan``
        marks an unlabeled sample (ξ_i = 0); any non-NaN entry is treated as
        labeled (ξ_i = 1).
    y_proxy : NDArray
        Array of shape ``(n_samples,)`` with proxy predictions. Required for
        every sample; must not contain NaN.
    pi : NDArray
        Array of shape ``(n_samples,)`` with the pre-determined sampling
        probability π_i ∈ [0, 1] of each sample. Entries with π_i = 0 are
        excluded from all computations.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    power_tuning : bool, optional
        If ``True`` (default), λ is chosen analytically to minimise asymptotic
        variance. If ``False``, λ = 1 is used (the untuned estimator).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Contains a ``CLTConfidenceInterval``, metric name, estimator
        name (``"ASIMeanEstimator"``), the counts ``n_true`` (labeled samples
        kept after preprocessing) and ``n_proxy`` (samples kept after
        preprocessing), and ``effective_sample_size``, computed as
        ``floor(n_true * var_classical / var_estimate)`` against an
        ``IPWClassicalMeanEstimator`` baseline.

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
        - If any proxy value is NaN.
        - If the rectifiers ``y_proxy * (ξ_i / π_i - 1)`` are constant.
        - If any value in ``pi`` is not in [0, 1].
    """
    labels, proxy, xi, probs = self._preprocess(y_true, y_proxy, pi)

    n_true = int(xi.sum())
    n_proxy = len(probs)

    _lambda = self._compute_tuning_parameter(labels, proxy, xi, probs, power_tuning)
    rectified = self._compute_rectified_labels(labels, proxy, xi, probs, _lambda)

    confidence_interval = CLTConfidenceInterval(
        mean=np.mean(rectified),
        std=np.std(rectified, ddof=1) / np.sqrt(n_proxy),
        confidence_level=confidence_level,
    )

    # Baseline is fitted on the raw (unfiltered) labels and probabilities.
    baseline_result = IPWClassicalMeanEstimator().estimate(y_true, pi)
    baseline_interval = baseline_result.confidence_interval
    effective_sample_size = floor(n_true * baseline_interval.var / confidence_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_true,
        n_proxy=n_proxy,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.ptd.PTDMeanEstimator

Estimator for population mean using Predict-Then-Debias (PTD).

Combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy model. Confidence intervals are constructed via a bootstrap percentile method, requiring no distributional assumptions on the proxy quality.

The bootstrap uses a CLT-based algorithm: the unlabeled proxy mean is computed once on the full unlabeled set and its sampling variability is simulated with a Gaussian draw at each iteration, making the per-iteration cost O(n_labeled) rather than O(n_labeled + n_unlabeled), where n_labeled and n_unlabeled are the number of labeled and unlabeled samples respectively.

References

Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates. "Prediction-powered inference with imputed covariates and nonuniform sampling." arXiv preprint arXiv:2501.18577 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators import PTDMeanEstimator
>>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
>>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
>>> estimator = PTDMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, n_bootstrap=5, random_seed=0)
>>> print(result)
Metric: Metric
Point Estimate: 5.552
Confidence Interval (95%): [5.211, 5.865]
Estimator : PTDMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 5
Source code in glide/estimators/ptd.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
class PTDMeanEstimator:
    """Estimator for population mean using Predict-Then-Debias (PTD).

    A small labeled set is combined with a large unlabeled set whose labels
    are approximated by a proxy model. The confidence interval comes from a
    bootstrap percentile method, so no distributional assumptions on the
    proxy quality are required.

    The bootstrap uses a CLT-based algorithm: the unlabeled proxy mean is
    computed a single time on the full unlabeled set, and its sampling
    variability is simulated with a Gaussian draw at each iteration. The
    per-iteration cost is therefore O(n_labeled) rather than
    O(n_labeled + n_unlabeled), where n_labeled and n_unlabeled are the
    number of labeled and unlabeled samples respectively.

    References
    ----------
    Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates.
    "Prediction-powered inference with imputed covariates and nonuniform sampling."
    arXiv preprint arXiv:2501.18577 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import PTDMeanEstimator
    >>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
    >>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
    >>> estimator = PTDMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, n_bootstrap=5, random_seed=0)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.552
    Confidence Interval (95%): [5.211, 5.865]
    Estimator : PTDMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 5
    """

    def _preprocess(self, y_true_all: NDArray, y_proxy_all: NDArray) -> Tuple[NDArray, NDArray, NDArray]:
        """Validate the inputs and split them by label availability.

        Returns, in order: the non-NaN labels, the proxy predictions of the
        labeled rows, and the proxy predictions of the unlabeled rows.
        """
        _validate_equal_lengths(y_true_all, y_proxy_all, names=["y_true", "y_proxy"])
        _validate_y_proxy(y_proxy_all)
        _validate_y_true(y_true_all)

        has_label = ~np.isnan(y_true_all)
        _validate_sample_sizes(has_label)

        return y_true_all[has_label], y_proxy_all[has_label], y_proxy_all[~has_label]

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        n_bootstrap: int = 2000,
        power_tuning: bool = True,
        random_seed: Optional[int] = None,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using Predict-Then-Debias (PTD).

        A small labeled set is combined with a large unlabeled set whose labels
        are approximated by a proxy model. The rectifier
        ``mean(y_true) - λ·mean(y_proxy_labeled)`` corrects the bias of the proxy,
        so the estimate stays consistent even when the proxy is imperfect.

        Both the tuning parameter λ and the confidence interval are derived
        from a bootstrap over the labeled set only. The sampling variability of
        the unlabeled proxy mean is approximated by a single Gaussian draw per
        iteration, which keeps the per-iteration cost at O(n_labeled), where
        n_labeled is the number of labeled samples.

        Parameters
        ----------
        y_true : NDArray
            Array of labeled observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
        y_proxy : NDArray
            Array of proxy predictions, shape ``(n_samples,)``.
            Must be fully populated (no NaN). Must have nonzero variance.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        n_bootstrap : int, optional
            Number of bootstrap resamples. Defaults to ``2000``.
        power_tuning : bool, optional
            If ``True`` (default), the optimal tuning parameter λ is estimated
            from the bootstrap covariances. If ``False``, λ = 1 is used.
        random_seed : int, optional
            Seed for the random number generator, for reproducibility.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Contains a ``BootstrapConfidenceInterval``, metric name, estimator
            name (``"PTDMeanEstimator"``), the counts ``n_true`` (labeled samples)
            and ``n_proxy`` (total samples), and ``effective_sample_size``,
            computed as ``floor(n_true * var_classical / var_estimate)`` against
            a ``ClassicalMeanEstimator`` baseline on the labeled values.

        Raises
        ------
        ValueError
            - If ``y_true`` and ``y_proxy`` have different lengths.
            - If any proxy value is NaN.
            - If all proxy values are identical.
            - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        labels, proxy_labeled, proxy_unlabeled = self._preprocess(y_true, y_proxy)
        n_labeled = len(labels)
        n_unlabeled = len(proxy_unlabeled)
        rng = np.random.default_rng(random_seed)

        # Summary statistics of the unlabeled proxy mean, computed once.
        mean_proxy_unlabeled = np.mean(proxy_unlabeled)
        var_proxy_unlabeled = np.var(proxy_unlabeled, ddof=1) / n_unlabeled

        # The same generator is passed to both bootstrap helpers, in this order.
        boot_label_means, boot_proxy_means = _compute_bootstrap_labeled_means(
            labels, proxy_labeled, n_bootstrap, rng
        )
        lambda_ = _compute_tuning_parameter(boot_label_means, boot_proxy_means, var_proxy_unlabeled, power_tuning)
        boot_estimates = _compute_bootstrap_mean_estimates(
            boot_label_means,
            boot_proxy_means,
            mean_proxy_unlabeled,
            var_proxy_unlabeled,
            lambda_,
            rng,
        )

        confidence_interval = BootstrapConfidenceInterval(
            bootstrap_estimates=boot_estimates,
            confidence_level=confidence_level,
        )

        baseline_result = ClassicalMeanEstimator().estimate(labels)
        baseline_interval = baseline_result.confidence_interval
        effective_sample_size = floor(n_labeled * baseline_interval.var / confidence_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            n_proxy=n_labeled + n_unlabeled,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    metric_name="Metric",
    confidence_level=0.95,
    n_bootstrap=2000,
    power_tuning=True,
    random_seed=None,
)

Estimate the population mean using Predict-Then-Debias (PTD).

Combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy model. The rectifier mean(y_true) - λ·mean(y_proxy_labeled) corrects the bias of the proxy, yielding a consistent estimate even when the proxy is imperfect.

The tuning parameter λ and the confidence interval are both derived from a bootstrap over the labeled set only. The sampling variability of the unlabeled proxy mean is approximated by a single Gaussian draw per iteration, keeping the per-iteration cost O(n_labeled), where n_labeled is the number of labeled samples.

Parameters:

Name Type Description Default
y_true NDArray

Array of labeled observations, shape (n_samples,). Labeled entries are finite; unlabeled entries are np.nan.

required
y_proxy NDArray

Array of proxy predictions, shape (n_samples,). Must be fully populated (no NaN). Must have nonzero variance.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval. Defaults to 0.95.

0.95
n_bootstrap int

Number of bootstrap resamples. Defaults to 2000.

2000
power_tuning bool

If True (default), estimate the optimal tuning parameter λ from the bootstrap covariances. If False, use λ = 1.

True
random_seed int

Seed for the random number generator, for reproducibility. Defaults to None (non-deterministic).

None

Returns:

Type Description
PredictionPoweredMeanInferenceResult

Contains a BootstrapConfidenceInterval, metric name, estimator name ("PTDMeanEstimator"), and counts n_true (labeled samples) and n_proxy (total samples).

Raises:

Type Description
ValueError
  • If y_true and y_proxy have different lengths.
  • If any proxy value is NaN.
  • If all proxy values are identical.
  • If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
Source code in glide/estimators/ptd.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    n_bootstrap: int = 2000,
    power_tuning: bool = True,
    random_seed: Optional[int] = None,
) -> PredictionPoweredMeanInferenceResult:
    """Compute a Predict-Then-Debias (PTD) estimate of the population mean.

    A small labeled set is combined with a large unlabeled set whose labels
    are approximated by a proxy model. The rectifier
    ``mean(y_true) - λ·mean(y_proxy_labeled)`` removes the proxy's bias, so
    the estimate stays consistent even when the proxy is imperfect.

    Both λ and the confidence interval come from a bootstrap over the labeled
    samples only. The sampling variability of the unlabeled proxy mean is
    approximated by one Gaussian draw per bootstrap iteration, which keeps
    the per-iteration cost O(n_labeled), where n_labeled is the number of
    labeled samples.

    Parameters
    ----------
    y_true : NDArray
        Labeled observations, shape ``(n_samples,)``. Labeled entries are
        finite; unlabeled entries are ``np.nan``.
    y_proxy : NDArray
        Proxy predictions, shape ``(n_samples,)``. Must contain no NaN and
        must have nonzero variance.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    n_bootstrap : int, optional
        Number of bootstrap resamples. Defaults to ``2000``.
    power_tuning : bool, optional
        If ``True`` (default), estimate the optimal tuning parameter λ from
        the bootstrap covariances. If ``False``, use λ = 1.
    random_seed : int, optional
        Seed for the random number generator, for reproducibility.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds a ``BootstrapConfidenceInterval``, the metric name, the
        estimator name (``"PTDMeanEstimator"``), the counts ``n_true``
        (labeled samples) and ``n_proxy`` (total samples), and the effective
        sample size.

    Raises
    ------
    ValueError
        - If ``y_true`` and ``y_proxy`` have different lengths.
        - If any proxy value is NaN.
        - If all proxy values are identical.
        - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    labels, proxy_on_labeled, proxy_on_unlabeled = self._preprocess(y_true, y_proxy)
    n_labeled = len(labels)
    n_unlabeled = len(proxy_on_unlabeled)
    rng = np.random.default_rng(random_seed)

    # Mean of the unlabeled proxies and the variance of that mean
    # (sample variance divided by the unlabeled count).
    unlabeled_proxy_mean = np.mean(proxy_on_unlabeled)
    unlabeled_proxy_mean_var = np.var(proxy_on_unlabeled, ddof=1) / n_unlabeled

    # The helpers share one generator, so their call order fixes the random stream.
    boot_label_means, boot_proxy_means = _compute_bootstrap_labeled_means(
        labels, proxy_on_labeled, n_bootstrap, rng
    )
    lambda_ = _compute_tuning_parameter(
        boot_label_means, boot_proxy_means, unlabeled_proxy_mean_var, power_tuning
    )
    boot_estimates = _compute_bootstrap_mean_estimates(
        boot_label_means,
        boot_proxy_means,
        unlabeled_proxy_mean,
        unlabeled_proxy_mean_var,
        lambda_,
        rng,
    )

    confidence_interval = BootstrapConfidenceInterval(
        bootstrap_estimates=boot_estimates,
        confidence_level=confidence_level,
    )

    # Effective sample size: the labeled count scaled by the ratio of the
    # labels-only interval variance to the PTD interval variance, rounded down.
    baseline_interval = ClassicalMeanEstimator().estimate(labels).confidence_interval
    effective_sample_size = floor(n_labeled * baseline_interval.var / confidence_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        n_proxy=n_labeled + n_unlabeled,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.ipw_ptd.IPWPTDMeanEstimator

Estimator for population mean using IPW-corrected Predict-Then-Debias (IPW-PTD).

Extends PTD to handle non-uniform ground-truth labelling probabilities via inverse probability weighting. The bootstrap percentile confidence interval requires no distributional assumptions on the proxy quality. The CLT speedup is applied to the unlabeled proxies. However, inverse probability weighting requires sampling over the whole dataset to compute bootstrap ground-truth mean and labeled proxy mean estimates.

For large sample counts (where the CLT applies), this estimator produces inference equivalent to ASIMeanEstimator, but without relying on the normal approximation for the labeled rectifier.

References

Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates. "Prediction-powered inference with imputed covariates and nonuniform sampling." arXiv preprint arXiv:2501.18577 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators.ipw_ptd import IPWPTDMeanEstimator
>>> y_true = np.array([1.0, 0.0, np.nan, np.nan])
>>> y_proxy = np.array([0.9, 0.1, 0.8, 0.2])
>>> pi = np.array([0.4, 0.6, 0.3, 0.7])
>>> estimator = IPWPTDMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, pi, n_bootstrap=5, random_seed=0)
>>> print(result)
Metric: Metric
Point Estimate: 0.253
Confidence Interval (95%): [-0.082, 0.633]
Estimator : IPWPTDMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 9
Source code in glide/estimators/ipw_ptd.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
class IPWPTDMeanEstimator:
    """Estimator for population mean using IPW-corrected Predict-Then-Debias (IPW-PTD).

    Extends PTD to handle non-uniform ground-truth labelling probabilities via inverse probability
    weighting. The bootstrap percentile confidence interval requires no distributional
    assumptions on the proxy quality. The CLT speedup is applied to the unlabeled proxies.
    However, inverse probability weighting requires sampling over the whole dataset to
    compute bootstrap ground-truth mean and labeled proxy mean estimates.

    For large sample counts (where the CLT applies), this estimator produces inference
    equivalent to ``ASIMeanEstimator``, but without relying on the normal approximation
    for the labeled rectifier.

    References
    ----------
    Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates.
    "Prediction-powered inference with imputed covariates and nonuniform sampling."
    arXiv preprint arXiv:2501.18577 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators.ipw_ptd import IPWPTDMeanEstimator
    >>> y_true = np.array([1.0, 0.0, np.nan, np.nan])
    >>> y_proxy = np.array([0.9, 0.1, 0.8, 0.2])
    >>> pi = np.array([0.4, 0.6, 0.3, 0.7])
    >>> estimator = IPWPTDMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, pi, n_bootstrap=5, random_seed=0)
    >>> print(result)
    Metric: Metric
    Point Estimate: 0.253
    Confidence Interval (95%): [-0.082, 0.633]
    Estimator : IPWPTDMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 9
    """

    def _preprocess(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
    ) -> Tuple[NDArray, NDArray, NDArray, NDArray]:
        """Validate the inputs and build the arrays consumed by ``estimate``.

        Returns
        -------
        tuple of NDArray
            ``(y_true_filled, y_proxy, xi, pi)`` where ``y_true_filled`` is ``y_true``
            with NaN replaced by 0, ``xi`` is the label indicator as floats (1.0 where
            ``y_true`` is not NaN, 0.0 otherwise), and ``y_proxy`` and ``pi`` are passed
            through unchanged.
        """
        _validate_equal_lengths(y_true, y_proxy, pi, names=["y_true", "y_proxy", "pi"])
        _validate_probabilities(pi)
        _validate_y_proxy(y_proxy)
        _validate_y_true(y_true)

        y_true_non_nan_mask = ~np.isnan(y_true)
        xi = y_true_non_nan_mask.astype(float)

        _validate_sample_sizes(y_true_non_nan_mask)
        _validate_label_prob_consistency(y_true_non_nan_mask, pi)

        # Unlabeled entries get weight xi / pi == 0 later, so the fill value only
        # needs to be finite to keep NaN out of the weighted products.
        y_true_filled = np.nan_to_num(y_true, nan=0)
        return y_true_filled, y_proxy, xi, pi

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        n_bootstrap: int = 2000,
        power_tuning: bool = True,
        random_seed: Optional[int] = None,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using IPW-corrected Predict-Then-Debias.

        Ground-truth labels were sampled with known, non-uniform probabilities π_i.
        Inverse probability weighting (IPW) corrects for this non-uniform selection,
        yielding valid confidence intervals under any sampling rule.
        The unlabeled proxy mean is not resampled: its sampling variability is injected
        via a single Gaussian draw per iteration (CLT speedup).

        Parameters
        ----------
        y_true : NDArray
            Array of shape ``(n_samples,)`` with ground-truth labels. Use ``np.nan`` for
            unlabeled samples; non-NaN entries are treated as labeled.
        y_proxy : NDArray
            Array of shape ``(n_samples,)`` with proxy predictions. Must be present for every
            sample and must not contain NaN.
        pi : NDArray
            Array of shape ``(n_samples,)`` with the ground-truth labelling probability
            π_i ∈ [0, 1] for each sample.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        n_bootstrap : int, optional
            Number of bootstrap resamples. Defaults to ``2000``.
        power_tuning : bool, optional
            If ``True`` (default), estimates λ from bootstrap covariances to minimise variance.
            If ``False``, uses λ = 1.
        random_seed : int, optional
            Seed for the random number generator, for reproducibility.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Contains a ``BootstrapConfidenceInterval``, metric name, estimator
            name (``"IPWPTDMeanEstimator"``), the counts ``n_true`` (labeled samples) and
            ``n_proxy`` (total samples), and the effective sample size.

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
            - If any proxy value is NaN.
            - If all proxy values are identical.
            - If any sampling probability is not in [0, 1].
            - If any labeled sample (non-NaN ``y_true``) has a labeling probability of 0.
            - If any unlabeled sample (NaN ``y_true``) has a labeling probability of 1.
            - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        y_true_filled, y_proxy, xi, pi = self._preprocess(y_true, y_proxy, pi)
        rng = np.random.default_rng(random_seed)

        # Masks used to drop entries whose IPW weight is undefined (division by zero).
        # NOTE(review): presumably `_get_non_zero_mask` is True where its argument is
        # non-zero -- confirm against the helper.
        non_zero_pi_mask = _get_non_zero_mask(pi)
        non_one_pi_mask = _get_non_zero_mask(1 - pi)

        # Division by zero is expected where pi is 0 or 1; those entries are removed
        # by the masks just below, so the resulting warnings are silenced.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            labeled_ipw_weights = xi / pi
            unlabeled_ipw_weights = (1 - xi) / (1 - pi)

        weighted_y_true_filled = (y_true_filled * labeled_ipw_weights)[non_zero_pi_mask]
        weighted_y_proxy_labeled = (y_proxy * labeled_ipw_weights)[non_zero_pi_mask]
        weighted_y_proxy_unlabeled = (y_proxy * unlabeled_ipw_weights)[non_one_pi_mask]

        # Mean of the weighted unlabeled proxies and the variance of that mean.
        mean_proxy_unlabeled = np.mean(weighted_y_proxy_unlabeled)
        effective_n_proxy_unlabeled = len(weighted_y_proxy_unlabeled)
        var_proxy_unlabeled = np.var(weighted_y_proxy_unlabeled, ddof=1) / effective_n_proxy_unlabeled

        # The helpers share one generator, so their call order fixes the random stream.
        bootstrap_y_true_means, bootstrap_y_proxy_labeled_means = _compute_bootstrap_labeled_means(
            weighted_y_true_filled, weighted_y_proxy_labeled, n_bootstrap, rng
        )
        lambda_ = _compute_tuning_parameter(
            bootstrap_y_true_means, bootstrap_y_proxy_labeled_means, var_proxy_unlabeled, power_tuning
        )
        bootstrap_mean_estimates = _compute_bootstrap_mean_estimates(
            bootstrap_y_true_means,
            bootstrap_y_proxy_labeled_means,
            mean_proxy_unlabeled,
            var_proxy_unlabeled,
            lambda_,
            rng,
        )

        n_labeled = int(xi.sum())
        # `n_proxy` is documented as the total number of samples. The masked length
        # above excludes samples with pi == 1, so it is only used as the variance
        # denominator and the full dataset size is reported instead.
        n_total = len(y_proxy)

        confidence_interval = BootstrapConfidenceInterval(
            bootstrap_estimates=bootstrap_mean_estimates,
            confidence_level=confidence_level,
        )
        # Effective sample size: the labeled count scaled by the ratio of the
        # labels-only IPW interval variance to the IPW-PTD interval variance, rounded down.
        classical_confidence_interval = IPWClassicalMeanEstimator().estimate(y_true, pi).confidence_interval
        effective_sample_size = floor(n_labeled * classical_confidence_interval.var / confidence_interval.var)
        result = PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            n_proxy=n_total,
            effective_sample_size=effective_sample_size,
        )
        return result

estimate

estimate(
    y_true,
    y_proxy,
    pi,
    metric_name="Metric",
    confidence_level=0.95,
    n_bootstrap=2000,
    power_tuning=True,
    random_seed=None,
)

Estimate the population mean using IPW-corrected Predict-Then-Debias.

Ground-truth labels were sampled with known, non-uniform probabilities π_i. Inverse probability weighting (IPW) corrects for this non-uniform selection, yielding valid confidence intervals under any sampling rule. The unlabeled proxy mean is not resampled: its sampling variability is injected via a single Gaussian draw per iteration (CLT speedup).

Parameters:

Name Type Description Default
y_true NDArray

Array of shape (n_samples,) with ground-truth labels. Use np.nan for unlabeled samples; non-NaN entries are treated as labeled.

required
y_proxy NDArray

Array of shape (n_samples,) with proxy predictions. Must be present for every sample and must not contain NaN.

required
pi NDArray

Array of shape (n_samples,) with the ground-truth labelling probability π_i ∈ [0, 1] for each sample.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval. Defaults to 0.95.

0.95
n_bootstrap int

Number of bootstrap resamples. Defaults to 2000.

2000
power_tuning bool

If True (default), estimates λ from bootstrap covariances to minimise variance. If False, uses λ = 1.

True
random_seed int

Seed for the random number generator, for reproducibility. Defaults to None (non-deterministic).

None

Returns:

Type Description
PredictionPoweredMeanInferenceResult

Contains a BootstrapConfidenceInterval, metric name, estimator name ("IPWPTDMeanEstimator"), and counts n_true (labeled samples) and n_proxy (total samples).

Raises:

Type Description
ValueError
  • If y_true, y_proxy, and pi do not all have the same length.
  • If any proxy value is NaN.
  • If all proxy values are identical.
  • If any sampling probability is not in [0, 1].
  • If any labeled sample (non-NaN y_true) has a labeling probability of 0.
  • If any unlabeled sample (NaN y_true) has a labeling probability of 1.
  • If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
Source code in glide/estimators/ipw_ptd.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    pi: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    n_bootstrap: int = 2000,
    power_tuning: bool = True,
    random_seed: Optional[int] = None,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean using IPW-corrected Predict-Then-Debias.

    Ground-truth labels were sampled with known, non-uniform probabilities π_i.
    Inverse probability weighting (IPW) corrects for this non-uniform selection,
    yielding valid confidence intervals under any sampling rule.
    The unlabeled proxy mean is not resampled: its sampling variability is injected
    via a single Gaussian draw per iteration (CLT speedup).

    Parameters
    ----------
    y_true : NDArray
        Array of shape ``(n_samples,)`` with ground-truth labels. Use ``np.nan`` for
        unlabeled samples; non-NaN entries are treated as labeled.
    y_proxy : NDArray
        Array of shape ``(n_samples,)`` with proxy predictions. Must be present for every
        sample and must not contain NaN.
    pi : NDArray
        Array of shape ``(n_samples,)`` with the ground-truth labelling probability
        π_i ∈ [0, 1] for each sample.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    n_bootstrap : int, optional
        Number of bootstrap resamples. Defaults to ``2000``.
    power_tuning : bool, optional
        If ``True`` (default), estimates λ from bootstrap covariances to minimise variance.
        If ``False``, uses λ = 1.
    random_seed : int, optional
        Seed for the random number generator, for reproducibility.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Contains a ``BootstrapConfidenceInterval``, metric name, estimator
        name (``"IPWPTDMeanEstimator"``), the counts ``n_true`` (labeled samples) and
        ``n_proxy`` (total samples), and the effective sample size.

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
        - If any proxy value is NaN.
        - If all proxy values are identical.
        - If any sampling probability is not in [0, 1].
        - If any labeled sample (non-NaN ``y_true``) has a labeling probability of 0.
        - If any unlabeled sample (NaN ``y_true``) has a labeling probability of 1.
        - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    y_true_filled, y_proxy, xi, pi = self._preprocess(y_true, y_proxy, pi)
    rng = np.random.default_rng(random_seed)

    # Masks used to drop entries whose IPW weight is undefined (division by zero).
    # NOTE(review): presumably `_get_non_zero_mask` is True where its argument is
    # non-zero -- confirm against the helper.
    non_zero_pi_mask = _get_non_zero_mask(pi)
    non_one_pi_mask = _get_non_zero_mask(1 - pi)

    # Division by zero is expected where pi is 0 or 1; those entries are removed
    # by the masks just below, so the resulting warnings are silenced.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        labeled_ipw_weights = xi / pi
        unlabeled_ipw_weights = (1 - xi) / (1 - pi)

    weighted_y_true_filled = (y_true_filled * labeled_ipw_weights)[non_zero_pi_mask]
    weighted_y_proxy_labeled = (y_proxy * labeled_ipw_weights)[non_zero_pi_mask]
    weighted_y_proxy_unlabeled = (y_proxy * unlabeled_ipw_weights)[non_one_pi_mask]

    # Mean of the weighted unlabeled proxies and the variance of that mean.
    mean_proxy_unlabeled = np.mean(weighted_y_proxy_unlabeled)
    effective_n_proxy_unlabeled = len(weighted_y_proxy_unlabeled)
    var_proxy_unlabeled = np.var(weighted_y_proxy_unlabeled, ddof=1) / effective_n_proxy_unlabeled

    # The helpers share one generator, so their call order fixes the random stream.
    bootstrap_y_true_means, bootstrap_y_proxy_labeled_means = _compute_bootstrap_labeled_means(
        weighted_y_true_filled, weighted_y_proxy_labeled, n_bootstrap, rng
    )
    lambda_ = _compute_tuning_parameter(
        bootstrap_y_true_means, bootstrap_y_proxy_labeled_means, var_proxy_unlabeled, power_tuning
    )
    bootstrap_mean_estimates = _compute_bootstrap_mean_estimates(
        bootstrap_y_true_means,
        bootstrap_y_proxy_labeled_means,
        mean_proxy_unlabeled,
        var_proxy_unlabeled,
        lambda_,
        rng,
    )

    n_labeled = int(xi.sum())
    # `n_proxy` is documented as the total number of samples. The masked length
    # above excludes samples with pi == 1, so it is only used as the variance
    # denominator and the full dataset size is reported instead.
    n_total = len(y_proxy)

    confidence_interval = BootstrapConfidenceInterval(
        bootstrap_estimates=bootstrap_mean_estimates,
        confidence_level=confidence_level,
    )
    # Effective sample size: the labeled count scaled by the ratio of the
    # labels-only IPW interval variance to the IPW-PTD interval variance, rounded down.
    classical_confidence_interval = IPWClassicalMeanEstimator().estimate(y_true, pi).confidence_interval
    effective_sample_size = floor(n_labeled * classical_confidence_interval.var / confidence_interval.var)
    result = PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        n_proxy=n_total,
        effective_sample_size=effective_sample_size,
    )
    return result

glide.estimators.stratified_ptd.StratifiedPTDMeanEstimator

Stratified Predict-Then-Debias estimator for population mean.

Extends PTD to datasets partitioned into strata (e.g. by language, domain, or data source). A per-stratum power-tuning parameter is computed independently within each stratum, and the final confidence interval is constructed from a bootstrap distribution obtained by combining the per-stratum bootstrap estimates with weights proportional to the stratum sizes.

This yields narrower confidence intervals than standard PTD whenever strata differ in proxy quality, because the optimal power-tuning parameter can adapt to each stratum's signal-to-noise ratio.

References

Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates. "Prediction-powered inference with imputed covariates and nonuniform sampling." arXiv preprint arXiv:2501.18577 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators import StratifiedPTDMeanEstimator
>>> y_true = np.array([5.0, 6.0, np.nan, np.nan, 5.0, 6.0, np.nan, np.nan])
>>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1, 4.9, 6.1, 5.2, 6.1])
>>> groups = np.array(["A", "A", "A", "A", "B", "B", "B", "B"])
>>> estimator = StratifiedPTDMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, groups, n_bootstrap=5, random_seed=0)
>>> print(result)
Metric: Metric
Point Estimate: 5.578
Confidence Interval (95%): [5.400, 5.664]
Estimator : StratifiedPTDMeanEstimator
n_true: 4
n_proxy: 8
Effective Sample Size: 33
Source code in glide/estimators/stratified_ptd.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
class StratifiedPTDMeanEstimator:
    """Population-mean estimator using Stratified Predict-Then-Debias.

    Applies PTD to a dataset split into strata (e.g. by language, domain, or
    data source). Each stratum gets its own power-tuning parameter, computed
    independently, and the final confidence interval is built from a bootstrap
    distribution formed by combining the per-stratum bootstrap estimates with
    weights proportional to the stratum sizes.

    Whenever strata differ in proxy quality this gives narrower confidence
    intervals than standard PTD, because the optimal power-tuning parameter
    can adapt to each stratum's signal-to-noise ratio.

    References
    ----------
    Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates.
    "Prediction-powered inference with imputed covariates and nonuniform sampling."
    arXiv preprint arXiv:2501.18577 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import StratifiedPTDMeanEstimator
    >>> y_true = np.array([5.0, 6.0, np.nan, np.nan, 5.0, 6.0, np.nan, np.nan])
    >>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1, 4.9, 6.1, 5.2, 6.1])
    >>> groups = np.array(["A", "A", "A", "A", "B", "B", "B", "B"])
    >>> estimator = StratifiedPTDMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, groups, n_bootstrap=5, random_seed=0)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.578
    Confidence Interval (95%): [5.400, 5.664]
    Estimator : StratifiedPTDMeanEstimator
    n_true: 4
    n_proxy: 8
    Effective Sample Size: 33
    """

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        groups: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        n_bootstrap: int = 2000,
        power_tuning: bool = True,
        random_seed: Optional[int] = None,
    ) -> PredictionPoweredMeanInferenceResult:
        """Compute a Stratified Predict-Then-Debias estimate of the population mean.

        The arrays are split by the unique values in ``groups``. The PTD
        bootstrap is run inside each stratum with its own power-tuning
        parameter, and the per-stratum bootstrap arrays are merged, weighted
        by stratum size, into one ``BootstrapConfidenceInterval``:

            theta = sum_k  w_k * theta_k(lambda_k)

        Here ``w_k`` is the fraction of samples that fall in stratum *k* and
        ``theta_k(lambda_k)`` is that stratum's mean estimate under
        power-tuning parameter ``lambda_k``.

        This assumes the sample fractions reflect the true stratum weights in
        the target data distribution, which is important for statistical
        validity.

        A sample counts as labeled when its ``y_true`` entry is not ``NaN``
        and as unlabeled otherwise.

        Parameters
        ----------
        y_true : NDArray
            Observations, shape ``(n_samples,)``. Labeled entries are finite;
            unlabeled entries are ``np.nan``.
        y_proxy : NDArray
            Proxy predictions, shape ``(n_samples,)``. Must contain no NaN and
            must have nonzero variance within each stratum.
        groups : NDArray
            Stratum identifiers, shape ``(n_samples,)``. Unique values define the strata.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        n_bootstrap : int, optional
            Number of bootstrap resamples. Defaults to ``2000``.
        power_tuning : bool, optional
            If ``True`` (default), estimate the optimal per-stratum power-tuning
            parameter ``lambda_k`` from the bootstrap covariances. If ``False``,
            use ``lambda_k = 1.0`` for all strata.
        random_seed : int, optional
            Seed for the random number generator, for reproducibility.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds the bootstrap-based confidence interval, the metric name,
            the estimator name (``"StratifiedPTDMeanEstimator"``), the counts
            ``n_true`` (total labeled rows) and ``n_proxy`` (total dataset
            size), and the effective sample size.

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
            - If any proxy value is NaN.
            - If all proxy values within a stratum are identical (zero variance), which would
              cause a division by zero when computing the power-tuning parameter.
            - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        strata = _preprocess(y_true, y_proxy, groups)

        total_count = len(y_true)
        rng = np.random.default_rng(random_seed)

        # Running size-weighted sum of the per-stratum bootstrap draws.
        combined_draws = np.zeros(n_bootstrap)

        # One generator is shared across strata, so iteration order and the
        # order of helper calls determine the random stream.
        for labels_k, proxy_labeled_k, proxy_unlabeled_k in strata:
            n_labeled_k = len(labels_k)
            n_unlabeled_k = len(proxy_unlabeled_k)
            weight_k = (n_labeled_k + n_unlabeled_k) / total_count

            # Unlabeled proxy mean for this stratum and the variance of that mean.
            proxy_mean_k = np.mean(proxy_unlabeled_k)
            proxy_mean_var_k = np.var(proxy_unlabeled_k, ddof=1) / n_unlabeled_k

            boot_true_means_k, boot_proxy_means_k = _compute_bootstrap_labeled_means(
                labels_k, proxy_labeled_k, n_bootstrap, rng
            )
            lambda_k = _compute_tuning_parameter(
                boot_true_means_k, boot_proxy_means_k, proxy_mean_var_k, power_tuning
            )
            draws_k = _compute_bootstrap_mean_estimates(
                boot_true_means_k,
                boot_proxy_means_k,
                proxy_mean_k,
                proxy_mean_var_k,
                lambda_k,
                rng,
            )

            combined_draws += weight_k * draws_k

        confidence_interval = BootstrapConfidenceInterval(
            bootstrap_estimates=combined_draws,
            confidence_level=confidence_level,
        )

        # Labels-only stratified baseline, using the same size-proportional weights.
        stratum_counts = np.unique(groups, return_counts=True)[1]
        baseline = StratifiedClassicalMeanEstimator().estimate(
            y_true, groups, stratum_weights=stratum_counts / total_count
        )
        baseline_interval = baseline.confidence_interval

        # Effective sample size: the labeled count scaled by the ratio of the
        # baseline interval variance to the stratified PTD interval variance, rounded down.
        n_labeled = int(np.count_nonzero(~np.isnan(y_true)))
        effective_sample_size = floor(n_labeled * baseline_interval.var / confidence_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            n_proxy=total_count,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    groups,
    metric_name="Metric",
    confidence_level=0.95,
    n_bootstrap=2000,
    power_tuning=True,
    random_seed=None,
)

Estimate the population mean using Stratified Predict-Then-Debias.

Splits arrays by unique values in groups, applies the PTD bootstrap algorithm within each stratum with a per-stratum power-tuning, and combines the resulting per-stratum bootstrap arrays with weights proportional to the stratum sizes into a single BootstrapConfidenceInterval:

theta = sum_k  w_k * theta_k(lambda_k)

where w_k is the fraction of samples in stratum k and theta_k(lambda_k) is the mean estimate for that stratum computed with power-tuning parameter lambda_k.

Note that this assumes these fractions reflect the true stratum weights in the target data distribution, which is important for statistical validity.

Labeled and unlabeled samples are distinguished by NaN in y_true: a sample is labeled if its y_true entry is not NaN.

Parameters:

Name Type Description Default
y_true NDArray

Array of observations, shape (n_samples,). Labeled entries are finite; unlabeled entries are np.nan.

required
y_proxy NDArray

Array of proxy predictions, shape (n_samples,). Must be fully populated (no NaN). Must have nonzero variance within each stratum.

required
groups NDArray

Array of stratum identifiers, shape (n_samples,). Unique values define the strata.

required
metric_name str

Human-readable label for the metric. Defaults to "Metric".

'Metric'
confidence_level float

Target coverage for the confidence interval. Defaults to 0.95.

0.95
n_bootstrap int

Number of bootstrap resamples. Defaults to 2000.

2000
power_tuning bool

If True (default), estimate the optimal per-stratum power-tuning parameter lambda_k from the bootstrap covariances. If False, use lambda_k = 1.0 for all strata.

True
random_seed int

Seed for the random number generator, for reproducibility. Defaults to None (non-deterministic).

None

Returns:

Type Description
PredictionPoweredMeanInferenceResult

Contains the bootstrap-based confidence interval, the metric name, the estimator name ("StratifiedPTDMeanEstimator"), and the counts n_true (total labeled rows) and n_proxy (total dataset size).

Raises:

Type Description
ValueError
  • If y_true, y_proxy, and groups do not all have the same length.
  • If any proxy value is NaN.
  • If all proxy values within a stratum are identical (zero variance), which would cause a division by zero when computing the power-tuning parameter.
  • If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
Source code in glide/estimators/stratified_ptd.py
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    groups: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    n_bootstrap: int = 2000,
    power_tuning: bool = True,
    random_seed: Optional[int] = None,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean with Stratified Predict-Then-Debias (PTD).

    The inputs are partitioned by the unique values of ``groups``. Within each
    stratum the PTD bootstrap is run with its own power-tuning parameter, and
    the per-stratum bootstrap arrays are summed with weights proportional to
    the stratum sizes to form one ``BootstrapConfidenceInterval``:

        theta = sum_k  w_k * theta_k(lambda_k)

    Here ``w_k`` is the share of all samples that fall in stratum *k*, and
    ``theta_k(lambda_k)`` is that stratum's mean estimate obtained with
    power-tuning parameter ``lambda_k``.

    Statistical validity relies on those shares matching the true stratum
    weights of the target data distribution.

    A sample counts as labeled when its ``y_true`` entry is not ``NaN``;
    ``NaN`` entries mark unlabeled samples.

    Parameters
    ----------
    y_true : NDArray
        Observations, shape ``(n_samples,)``. Finite where labeled,
        ``np.nan`` where unlabeled.
    y_proxy : NDArray
        Proxy predictions, shape ``(n_samples,)``. No NaN allowed, and the
        values must have nonzero variance within each stratum.
    groups : NDArray
        Stratum identifiers, shape ``(n_samples,)``. Each unique value is one stratum.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    n_bootstrap : int, optional
        Number of bootstrap resamples. Defaults to ``2000``.
    power_tuning : bool, optional
        When ``True`` (default), the optimal per-stratum power-tuning parameter
        ``lambda_k`` is estimated from the bootstrap covariances. When ``False``,
        ``lambda_k = 1.0`` is used for every stratum.
    random_seed : int, optional
        Seed for the random number generator, for reproducibility.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the bootstrap-based confidence interval, the metric name, the
        estimator name (``"StratifiedPTDMeanEstimator"``), the counts ``n_true``
        (total labeled rows) and ``n_proxy`` (total dataset size), and an
        ``effective_sample_size`` equal to the labeled count scaled by the ratio
        of the stratified classical interval's variance to the bootstrap
        interval's variance, rounded down.

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
        - If any proxy value is NaN.
        - If all proxy values within a stratum are identical (zero variance), which would
          cause a division by zero when computing the power-tuning parameter.
        - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    strata = _preprocess(y_true, y_proxy, groups)

    total_count = len(y_true)
    # A single generator is threaded through every stratum, so one seed fixes the whole run.
    rng = np.random.default_rng(random_seed)

    # Running weighted sum of the per-stratum bootstrap draws.
    combined_draws = np.zeros(n_bootstrap)

    for labeled_truth, labeled_proxy, unlabeled_proxy in strata:
        unlabeled_count = len(unlabeled_proxy)
        # Stratum weight: this stratum's share (labeled + unlabeled) of all samples.
        weight = (len(labeled_truth) + unlabeled_count) / total_count

        # Mean of the unlabeled proxies and the variance of that mean
        # (sample variance divided by the number of unlabeled samples).
        unlabeled_proxy_mean = np.mean(unlabeled_proxy)
        unlabeled_proxy_mean_var = np.var(unlabeled_proxy, ddof=1) / unlabeled_count

        truth_means, proxy_means = _compute_bootstrap_labeled_means(
            labeled_truth, labeled_proxy, n_bootstrap, rng
        )
        tuning = _compute_tuning_parameter(truth_means, proxy_means, unlabeled_proxy_mean_var, power_tuning)
        stratum_draws = _compute_bootstrap_mean_estimates(
            truth_means,
            proxy_means,
            unlabeled_proxy_mean,
            unlabeled_proxy_mean_var,
            tuning,
            rng,
        )

        combined_draws += weight * stratum_draws

    ptd_interval = BootstrapConfidenceInterval(
        bootstrap_estimates=combined_draws,
        confidence_level=confidence_level,
    )

    # Baseline: stratified classical estimate on the same data, weighted by the
    # observed stratum proportions; only its interval is needed here.
    stratum_weights = np.unique(groups, return_counts=True)[1] / total_count
    baseline = StratifiedClassicalMeanEstimator().estimate(y_true, groups, stratum_weights=stratum_weights)
    baseline_interval = baseline.confidence_interval

    labeled_total = int((~np.isnan(y_true)).sum())
    # Labeled count rescaled by the classical-to-PTD variance ratio, rounded down.
    effective_sample_size = floor(labeled_total * baseline_interval.var / ptd_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=ptd_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=labeled_total,
        n_proxy=total_count,
        effective_sample_size=effective_sample_size,
    )