Estimators

glide.estimators.classical.ClassicalMeanEstimator

Estimator for population mean using the classical sample mean.

Uses only a single array y to compute the sample mean and its standard error via the Central Limit Theorem. This serves as a baseline that does not require proxy predictions.

Examples:

>>> import numpy as np
>>> from glide.estimators import ClassicalMeanEstimator
>>> y = np.array([5.0, 6.0, 4.0, 7.0])
>>> estimator = ClassicalMeanEstimator()
>>> result = estimator.estimate(y)
>>> print(result)
Metric: Metric
Point Estimate: 5.500
Confidence Interval (95%): [4.235, 6.765]
Estimator : ClassicalMeanEstimator
n: 4

Source code in glide/estimators/classical.py

class ClassicalMeanEstimator:
    """Baseline population-mean estimator built on the plain sample mean.

    Only a single array ``y`` is required. NaN entries are discarded, the
    sample mean of what remains is the point estimate, and the standard
    error (sample standard deviation with ``ddof=1`` divided by
    ``sqrt(n)``) drives a Central Limit Theorem confidence interval. No
    proxy predictions are involved, which makes this the reference
    baseline.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ClassicalMeanEstimator
    >>> y = np.array([5.0, 6.0, 4.0, 7.0])
    >>> estimator = ClassicalMeanEstimator()
    >>> result = estimator.estimate(y)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.500
    Confidence Interval (95%): [4.235, 6.765]
    Estimator : ClassicalMeanEstimator
    n: 4
    """

    def _preprocess(self, y: NDArray) -> NDArray:
        """Return the non-NaN entries of ``y`` after the sample-size check."""
        observed = y[~np.isnan(y)]
        # The minimum-sample requirement is delegated to the shared validator.
        _validate_min_samples(observed, "y")
        return observed

    def estimate(
        self,
        y: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
    ) -> ClassicalMeanInferenceResult:
        """Compute the sample mean of ``y`` with a CLT confidence interval.

        Parameters
        ----------
        y : NDArray
            Observations, shape ``(n_samples,)``. NaN entries are ignored.
        metric_name : str, optional
            Human-readable label attached to the result. Defaults to
            ``"Metric"``.
        confidence_level : float, optional
            Desired coverage of the confidence interval (``0.95`` means a
            95 % CI). Defaults to ``0.95``.

        Returns
        -------
        ClassicalMeanInferenceResult
            Bundles the CLT-based confidence interval, the metric name,
            the estimator name (``"ClassicalMeanEstimator"``), and ``n``,
            the number of non-NaN observations used.

        Raises
        ------
        ValueError
            If ``y`` contains fewer than 2 non-NaN values.
        """
        observed = self._preprocess(y)
        n_obs = len(observed)
        point_estimate = np.mean(observed)
        # Standard error of the mean: unbiased sample std over sqrt(n).
        standard_error = np.std(observed, ddof=1) / np.sqrt(n_obs)
        interval = CLTConfidenceInterval(
            mean=point_estimate,
            std=standard_error,
            confidence_level=confidence_level,
        )
        return ClassicalMeanInferenceResult(
            confidence_interval=interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=n_obs,
        )

estimate

estimate(y, metric_name='Metric', confidence_level=0.95)

Estimate the population mean using the classical sample mean.

Parameters:

Name	Type	Description	Default
`y`	`NDArray`	Array of observations, shape `(n_samples,)`.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval, e.g. `0.95` for a 95 % CI. Defaults to `0.95`.	`0.95`

Returns:

Type	Description
`ClassicalMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"ClassicalMeanEstimator"`), and `n` (number of observations).

Raises:

Type	Description
`ValueError`	If `y` contains fewer than 2 non-NaN values.

Source code in glide/estimators/classical.py

def estimate(
    self,
    y: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
) -> ClassicalMeanInferenceResult:
    """Compute the sample mean of ``y`` with a CLT confidence interval.

    Parameters
    ----------
    y : NDArray
        Observations, shape ``(n_samples,)``.
    metric_name : str, optional
        Human-readable label attached to the result. Defaults to
        ``"Metric"``.
    confidence_level : float, optional
        Desired coverage of the confidence interval (``0.95`` means a
        95 % CI). Defaults to ``0.95``.

    Returns
    -------
    ClassicalMeanInferenceResult
        Bundles the CLT-based confidence interval, the metric name,
        the estimator name (``"ClassicalMeanEstimator"``), and ``n``,
        the number of observations used.

    Raises
    ------
    ValueError
        If ``y`` contains fewer than 2 non-NaN values.
    """
    observed = self._preprocess(y)
    n_obs = len(observed)
    point_estimate = np.mean(observed)
    # Standard error of the mean: unbiased sample std over sqrt(n).
    standard_error = np.std(observed, ddof=1) / np.sqrt(n_obs)
    interval = CLTConfidenceInterval(
        mean=point_estimate,
        std=standard_error,
        confidence_level=confidence_level,
    )
    return ClassicalMeanInferenceResult(
        confidence_interval=interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=n_obs,
    )

glide.estimators.stratified_classical.StratifiedClassicalMeanEstimator

Stratified classical estimator for population mean.

Extends mean estimation as in ClassicalMeanEstimator to datasets partitioned into strata (e.g. by language, domain, or data source). A per-stratum sample mean and standard error are computed independently, then combined with population-proportional weights.

Examples:

>>> import numpy as np
>>> from glide.estimators import StratifiedClassicalMeanEstimator
>>> y = np.array([1.0, 3.0, 5.0, 7.0])
>>> groups = np.array(["A", "A", "B", "B"])
>>> estimator = StratifiedClassicalMeanEstimator()
>>> result = estimator.estimate(y, groups)
>>> print(result)
Metric: Metric
Point Estimate: 4.000
Confidence Interval (95%): [2.614, 5.386]
Estimator : StratifiedClassicalMeanEstimator
n: 4

Source code in glide/estimators/stratified_classical.py

class StratifiedClassicalMeanEstimator:
    """Stratified classical estimator for population mean.

    Extends mean estimation as in `ClassicalMeanEstimator` to datasets partitioned
    into strata (e.g. by language, domain, or data source). A per-stratum sample
    mean and standard error are computed independently, then combined with
    population-proportional weights.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import StratifiedClassicalMeanEstimator
    >>> y = np.array([1.0, 3.0, 5.0, 7.0])
    >>> groups = np.array(["A", "A", "B", "B"])
    >>> estimator = StratifiedClassicalMeanEstimator()
    >>> result = estimator.estimate(y, groups)
    >>> print(result)
    Metric: Metric
    Point Estimate: 4.000
    Confidence Interval (95%): [2.614, 5.386]
    Estimator : StratifiedClassicalMeanEstimator
    n: 4
    """

    def estimate(
        self,
        y: NDArray,
        groups: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        stratum_weights: Optional[NDArray] = None,
    ) -> ClassicalMeanInferenceResult:
        """Estimate the population mean using stratified classical inference.

        Splits observations by ``groups``, computes a classical sample-mean
        estimate within each stratum, and combines them with stratum weights:

            theta = sum_k  w_k * theta_k
            sigma2 = sum_k  w_k^2 * sigma2_k

        where ``w_k`` is the weight of stratum *k*. By default ``w_k`` is the
        sample fraction ``n_samples_k / n_samples`` (counting non-NaN
        observations only); pass ``stratum_weights`` to use a different
        weighting.

        It is assumed that ``w_k`` reflects the true weight of stratum *k* for
        all *k*. Supplied weights are used as given; they are not normalized.

        Parameters
        ----------
        y : NDArray
            Array of observations. NaN entries are ignored.
        groups : NDArray
            Array of group identifiers (same length as ``y``). Unique values
            define the strata.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval, e.g. ``0.95``
            for a 95 % CI. Defaults to ``0.95``.
        stratum_weights : NDArray, optional
            Stratum weights in sorted stratum order (the order returned by
            ``np.unique(groups)``), one per stratum. When provided, these
            override the sample-count proportions. Defaults to ``None``
            (infer weights from sample counts).

        Returns
        -------
        ClassicalMeanInferenceResult
            Contains the CLT-based confidence interval, the metric name,
            the estimator name (``"StratifiedClassicalMeanEstimator"``), and
            ``n`` (total number of non-NaN samples).

        Raises
        ------
        ValueError
            - If ``y`` and ``groups`` do not have the same length.
            - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If ``groups`` is empty.
            - If ``stratum_weights`` does not have exactly one entry per stratum.
            - If any stratum contains fewer than 2 non-NaN values.
        """
        # Fail fast on misaligned inputs: a length-1 ``groups`` would otherwise
        # broadcast against ``y`` and silently put every observation in one stratum.
        if len(y) != len(groups):
            raise ValueError(f"y and groups must have the same length; got {len(y)} and {len(groups)}.")
        _validate_has_no_nan(groups, "groups")
        not_nan_mask = ~np.isnan(y)
        # Plain int so the reported ``n`` has the same type as in the other estimators.
        n_samples = int(np.sum(not_nan_mask))

        unique_strata = np.unique(groups)
        n_strata = len(unique_strata)
        if n_strata == 0:
            # Without this guard the loop below is skipped and a bogus
            # estimate of 0.0 with zero-width interval would be returned.
            raise ValueError("groups is empty; at least one stratum is required.")
        if stratum_weights is not None and len(stratum_weights) != n_strata:
            raise ValueError(
                f"stratum_weights must have one entry per stratum; got {len(stratum_weights)} weights for {n_strata} strata."
            )

        weighted_mean = 0.0
        weighted_var = 0.0
        for i, stratum_id in enumerate(unique_strata):
            stratum_mask = groups == stratum_id
            y_stratum = y[stratum_mask & not_nan_mask]
            _validate_min_samples(y_stratum, "y", stratum_id)

            n_samples_k = len(y_stratum)
            if stratum_weights is not None:
                w_k = stratum_weights[i]
            else:
                w_k = n_samples_k / n_samples
            mean_k = np.mean(y_stratum)
            # Variance of the stratum mean: unbiased sample variance over n_k.
            var_k = np.var(y_stratum, ddof=1) / n_samples_k
            weighted_mean += w_k * mean_k
            weighted_var += w_k**2 * var_k

        std = np.sqrt(weighted_var)
        ci = CLTConfidenceInterval(
            mean=weighted_mean,
            std=std,
            confidence_level=confidence_level,
        )
        result = ClassicalMeanInferenceResult(
            confidence_interval=ci,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=n_samples,
        )
        return result

estimate

estimate(
    y,
    groups,
    metric_name="Metric",
    confidence_level=0.95,
    stratum_weights=None,
)

Estimate the population mean using stratified classical inference.

Splits observations by groups, computes a classical sample-mean estimate within each stratum, and combines them with stratum weights:

theta = sum_k  w_k * theta_k
sigma2 = sum_k  w_k^2 * sigma2_k

where w_k is the weight of stratum k. By default w_k is the sample fraction n_samples_k / n_samples; pass stratum_weights to use a different weighting.

It is assumed that w_k reflects the true weight of stratum k for all k.

Parameters:

Name	Type	Description	Default
`y`	`NDArray`	Array of observations.	required
`groups`	`NDArray`	Array of group identifiers (same length as `y`). Unique values define the strata.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval, e.g. `0.95` for a 95 % CI. Defaults to `0.95`.	`0.95`
`stratum_weights`	`NDArray`	Stratum weights in sorted stratum order. When provided, these override the sample-count proportions. Defaults to `None` (infer weights from sample counts).	`None`

Returns:

Type	Description
`ClassicalMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"StratifiedClassicalMeanEstimator"`), and `n` (total number of samples).

Raises:

Type	Description
`ValueError`	If `groups` contains NaN values (numeric dtype) or None values (non-numeric dtype). If any stratum contains fewer than 2 non-NaN values.

Source code in glide/estimators/stratified_classical.py

def estimate(
    self,
    y: NDArray,
    groups: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    stratum_weights: Optional[NDArray] = None,
) -> ClassicalMeanInferenceResult:
    """Estimate the population mean using stratified classical inference.

    Splits observations by ``groups``, computes a classical sample-mean
    estimate within each stratum, and combines them with stratum weights:

        theta = sum_k  w_k * theta_k
        sigma2 = sum_k  w_k^2 * sigma2_k

    where ``w_k`` is the weight of stratum *k*. By default ``w_k`` is the
    sample fraction ``n_samples_k / n_samples`` (counting non-NaN
    observations only); pass ``stratum_weights`` to use a different
    weighting.

    It is assumed that ``w_k`` reflects the true weight of stratum *k* for
    all *k*. Supplied weights are used as given; they are not normalized.

    Parameters
    ----------
    y : NDArray
        Array of observations. NaN entries are ignored.
    groups : NDArray
        Array of group identifiers (same length as ``y``). Unique values
        define the strata.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval, e.g. ``0.95``
        for a 95 % CI. Defaults to ``0.95``.
    stratum_weights : NDArray, optional
        Stratum weights in sorted stratum order (the order returned by
        ``np.unique(groups)``), one per stratum. When provided, these
        override the sample-count proportions. Defaults to ``None``
        (infer weights from sample counts).

    Returns
    -------
    ClassicalMeanInferenceResult
        Contains the CLT-based confidence interval, the metric name,
        the estimator name (``"StratifiedClassicalMeanEstimator"``), and
        ``n`` (total number of non-NaN samples).

    Raises
    ------
    ValueError
        - If ``y`` and ``groups`` do not have the same length.
        - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If ``groups`` is empty.
        - If ``stratum_weights`` does not have exactly one entry per stratum.
        - If any stratum contains fewer than 2 non-NaN values.
    """
    # Fail fast on misaligned inputs: a length-1 ``groups`` would otherwise
    # broadcast against ``y`` and silently put every observation in one stratum.
    if len(y) != len(groups):
        raise ValueError(f"y and groups must have the same length; got {len(y)} and {len(groups)}.")
    _validate_has_no_nan(groups, "groups")
    not_nan_mask = ~np.isnan(y)
    # Plain int so the reported ``n`` has the same type as in the other estimators.
    n_samples = int(np.sum(not_nan_mask))

    unique_strata = np.unique(groups)
    n_strata = len(unique_strata)
    if n_strata == 0:
        # Without this guard the loop below is skipped and a bogus
        # estimate of 0.0 with zero-width interval would be returned.
        raise ValueError("groups is empty; at least one stratum is required.")
    if stratum_weights is not None and len(stratum_weights) != n_strata:
        raise ValueError(
            f"stratum_weights must have one entry per stratum; got {len(stratum_weights)} weights for {n_strata} strata."
        )

    weighted_mean = 0.0
    weighted_var = 0.0
    for i, stratum_id in enumerate(unique_strata):
        stratum_mask = groups == stratum_id
        y_stratum = y[stratum_mask & not_nan_mask]
        _validate_min_samples(y_stratum, "y", stratum_id)

        n_samples_k = len(y_stratum)
        if stratum_weights is not None:
            w_k = stratum_weights[i]
        else:
            w_k = n_samples_k / n_samples
        mean_k = np.mean(y_stratum)
        # Variance of the stratum mean: unbiased sample variance over n_k.
        var_k = np.var(y_stratum, ddof=1) / n_samples_k
        weighted_mean += w_k * mean_k
        weighted_var += w_k**2 * var_k

    std = np.sqrt(weighted_var)
    ci = CLTConfidenceInterval(
        mean=weighted_mean,
        std=std,
        confidence_level=confidence_level,
    )
    result = ClassicalMeanInferenceResult(
        confidence_interval=ci,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=n_samples,
    )
    return result

glide.estimators.ipw_classical.IPWClassicalMeanEstimator

Estimator for population mean using Inverse Probability Weighting (IPW).

Extends the classical sample mean to handle non-uniform sampling. Each observation y_i is reweighted by 1/π_i, where π_i is the pre-determined probability that sample i was selected for labeling. Some values of y_i may be NaN corresponding to unsampled instances.

For the computation to be statistically valid, the sum of π_i should be approximately equal to the number of observed elements y_i.

Examples:

>>> import numpy as np
>>> from glide.estimators import IPWClassicalMeanEstimator
>>> y = np.array([5.0, 6.0, 4.0, np.nan, np.nan, np.nan])
>>> pi = np.array([0.2, 0.8, 0.6, 0.6, 0.4, 0.4])
>>> estimator = IPWClassicalMeanEstimator()
>>> result = estimator.estimate(y, pi)
>>> print(result)
Metric: Metric
Point Estimate: 6.528
Confidence Interval (95%): [-1.230, 14.286]
Estimator : IPWClassicalMeanEstimator
n: 3

Source code in glide/estimators/ipw_classical.py

class IPWClassicalMeanEstimator:
    """Population-mean estimator based on Inverse Probability Weighting (IPW).

    Generalizes the classical sample mean to non-uniform sampling: every
    observation y_i is divided by π_i, the pre-determined probability that
    sample i was selected for labeling. Entries of ``y`` may be NaN, which
    marks instances that were not sampled.

    For the computation to be statistically valid, the sum of π_i should be
    approximately equal to the number of observed elements y_i.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import IPWClassicalMeanEstimator
    >>> y = np.array([5.0, 6.0, 4.0, np.nan, np.nan, np.nan])
    >>> pi = np.array([0.2, 0.8, 0.6, 0.6, 0.4, 0.4])
    >>> estimator = IPWClassicalMeanEstimator()
    >>> result = estimator.estimate(y, pi)
    >>> print(result)
    Metric: Metric
    Point Estimate: 6.528
    Confidence Interval (95%): [-1.230, 14.286]
    Estimator : IPWClassicalMeanEstimator
    n: 3
    """

    def _preprocess(self, y: NDArray, sampling_probability: NDArray) -> Tuple[NDArray, NDArray]:
        """Validate the inputs and keep the entries selected by the non-zero mask.

        Returns ``y`` and ``sampling_probability`` restricted to the mask
        produced by ``_get_non_zero_mask``; NaN entries of ``y`` are kept.
        """
        _validate_probabilities(sampling_probability)
        keep = _get_non_zero_mask(sampling_probability)
        # Consistency between labels and probabilities is checked on the
        # full, unfiltered arrays.
        is_labeled = ~np.isnan(y)
        _validate_label_prob_consistency(is_labeled, sampling_probability)
        return y[keep], sampling_probability[keep]

    def estimate(
        self,
        y: NDArray,
        sampling_probability: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
    ) -> ClassicalMeanInferenceResult:
        """Compute the IPW-corrected sample mean with a CLT confidence interval.

        Parameters
        ----------
        y : NDArray
            1-D array of observations; NaN marks an unobserved entry.
        sampling_probability : NDArray
            1-D array of pre-determined sampling probabilities π_i ∈ [0, 1],
            one per observation and of the same length as ``y``.
            Entries with π_i = 0 are excluded from the computation.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.

        Returns
        -------
        ClassicalMeanInferenceResult
            Bundles the CLT-based confidence interval, the metric name,
            the estimator name (``"IPWClassicalMeanEstimator"``), and ``n``
            (number of labeled observations).

        Raises
        ------
        ValueError
            If any value in ``sampling_probability`` is outside of [0, 1].
            If any labeled observation (non-NaN ``y``) has ``sampling_probability`` equal to 0.
        """
        y_kept, pi_kept = self._preprocess(y, sampling_probability)
        labeled_count = int(np.sum(~np.isnan(y_kept)))
        # The averaging denominator is every retained entry, labeled or not;
        # unlabeled (NaN) entries contribute a weighted value of zero.
        n_units = len(y_kept)
        weighted = np.nan_to_num(y_kept, nan=0) / pi_kept

        point_estimate = np.mean(weighted)
        standard_error = np.std(weighted, ddof=1) / np.sqrt(n_units)
        interval = CLTConfidenceInterval(
            mean=point_estimate,
            std=standard_error,
            confidence_level=confidence_level,
        )
        return ClassicalMeanInferenceResult(
            confidence_interval=interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=labeled_count,
        )

estimate

estimate(
    y,
    sampling_probability,
    metric_name="Metric",
    confidence_level=0.95,
)

Estimate the population mean using IPW-corrected sample mean.

Parameters:

Name	Type	Description	Default
`y`	`NDArray`	1-D array of observations, may contain unobserved NaN values.	required
`sampling_probability`	`NDArray`	1-D array of pre-determined sampling probabilities π_i ∈ [0, 1], one per observation. Must have the same length as `y`. Entries with π_i = 0 are excluded from the computation.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`

Returns:

Type	Description
`ClassicalMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"IPWClassicalMeanEstimator"`), and `n` (number of labeled observations).

Raises:

Type	Description
`ValueError`	If any value in `sampling_probability` is outside of [0, 1]. If any labeled observation (non-NaN `y`) has `sampling_probability` equal to 0.

Source code in glide/estimators/ipw_classical.py

def estimate(
    self,
    y: NDArray,
    sampling_probability: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
) -> ClassicalMeanInferenceResult:
    """Compute the IPW-corrected sample mean with a CLT confidence interval.

    Parameters
    ----------
    y : NDArray
        1-D array of observations; NaN marks an unobserved entry.
    sampling_probability : NDArray
        1-D array of pre-determined sampling probabilities π_i ∈ [0, 1],
        one per observation and of the same length as ``y``.
        Entries with π_i = 0 are excluded from the computation.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.

    Returns
    -------
    ClassicalMeanInferenceResult
        Bundles the CLT-based confidence interval, the metric name,
        the estimator name (``"IPWClassicalMeanEstimator"``), and ``n``
        (number of labeled observations).

    Raises
    ------
    ValueError
        If any value in ``sampling_probability`` is outside of [0, 1].
        If any labeled observation (non-NaN ``y``) has ``sampling_probability`` equal to 0.
    """
    y_kept, pi_kept = self._preprocess(y, sampling_probability)
    labeled_count = int(np.sum(~np.isnan(y_kept)))
    # The averaging denominator is every retained entry, labeled or not;
    # unlabeled (NaN) entries contribute a weighted value of zero.
    n_units = len(y_kept)
    weighted = np.nan_to_num(y_kept, nan=0) / pi_kept

    point_estimate = np.mean(weighted)
    standard_error = np.std(weighted, ddof=1) / np.sqrt(n_units)
    interval = CLTConfidenceInterval(
        mean=point_estimate,
        std=standard_error,
        confidence_level=confidence_level,
    )
    return ClassicalMeanInferenceResult(
        confidence_interval=interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=labeled_count,
    )

glide.estimators.clustered_classical.ClusteredClassicalMeanEstimator

Clustered classical estimator for population mean.

Extends mean estimation as in ClassicalMeanEstimator to datasets where observations are grouped into clusters. Each cluster's mean is treated as the sampling unit, which accounts for within-cluster correlation and produces valid confidence intervals under cluster sampling designs.

Examples:

>>> import numpy as np
>>> from glide.estimators import ClusteredClassicalMeanEstimator
>>> y = np.array([5.0, 5.0, 7.0, 7.0])
>>> clusters = np.array(["A", "A", "B", "B"])
>>> estimator = ClusteredClassicalMeanEstimator()
>>> result = estimator.estimate(y, clusters)
>>> print(result)
Metric: Metric
Point Estimate: 6.000
Confidence Interval (95%): [4.040, 7.960]
Estimator : ClusteredClassicalMeanEstimator
n: 4

Source code in glide/estimators/clustered_classical.py

class ClusteredClassicalMeanEstimator:
    """Population-mean estimator for cluster-sampled data.

    Adapts the ``ClassicalMeanEstimator`` approach to observations that come
    grouped into clusters. The mean of each cluster, rather than each
    individual observation, is the sampling unit; this accounts for
    within-cluster correlation and produces valid confidence intervals
    under cluster sampling designs.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ClusteredClassicalMeanEstimator
    >>> y = np.array([5.0, 5.0, 7.0, 7.0])
    >>> clusters = np.array(["A", "A", "B", "B"])
    >>> estimator = ClusteredClassicalMeanEstimator()
    >>> result = estimator.estimate(y, clusters)
    >>> print(result)
    Metric: Metric
    Point Estimate: 6.000
    Confidence Interval (95%): [4.040, 7.960]
    Estimator : ClusteredClassicalMeanEstimator
    n: 4
    """

    def _preprocess(
        self,
        y: NDArray,
        clusters: NDArray,
    ) -> Tuple[NDArray, NDArray, int]:
        """Validate inputs, drop NaN observations and relabel clusters.

        Returns the non-NaN observations, an integer cluster index for each
        of them (an index into the sorted unique clusters that still have
        data), and the number of such clusters.
        """
        _validate_equal_lengths(y, clusters, names=["y", "clusters"])
        _validate_has_no_nan(clusters, "clusters")
        observed = ~np.isnan(y)
        kept_y = y[observed]
        kept_clusters = clusters[observed]

        # Only clusters with at least one non-NaN observation survive here.
        surviving, labels = np.unique(kept_clusters, return_inverse=True)
        n_valid_clusters = len(surviving)
        _validate_bounds(
            n_valid_clusters,
            "n_valid_clusters",
            lower=2,
            error_message=f"Need at least 2 clusters with non-NaN observations; got {n_valid_clusters}.",
        )
        return kept_y, labels, n_valid_clusters

    def estimate(
        self,
        y: NDArray,
        clusters: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
    ) -> ClassicalMeanInferenceResult:
        """Estimate the population mean from cluster means.

        Within-cluster means are computed and then treated as the sampling
        units for the CLT:

            theta = (1 / L) * sum_l m_l
            sigma2 = Var(m_l, ddof=1) / L

        where ``m_l = (1/n_l) * sum_{i in l} y_i`` are the cluster means and
        ``L`` is the number of clusters. NaN values in ``y`` are dropped
        first, and clusters left with no observation are not used.

        Parameters
        ----------
        y : NDArray
            Array of observations, shape ``(n_samples,)``. NaN values are
            treated as missing and dropped.
        clusters : NDArray
            Array of cluster identifiers, shape ``(n_samples,)``.
            Unique values define the clusters.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.

        Returns
        -------
        ClassicalMeanInferenceResult
            Bundles the CLT-based confidence interval, the metric name,
            the estimator name (``"ClusteredClassicalMeanEstimator"``), and ``n``
            (total number of non-NaN observations across all clusters).

        Raises
        ------
        ValueError
            - If ``y`` and ``clusters`` do not have the same length.
            - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If fewer than 2 clusters have at least one non-NaN observation.
        """
        kept_y, labels, n_clusters = self._preprocess(y, clusters)
        n_observations = len(kept_y)

        # Per-cluster sum and count via bincount on the integer labels.
        per_cluster_total = np.bincount(labels, weights=kept_y)
        per_cluster_count = np.bincount(labels)
        per_cluster_mean = per_cluster_total / per_cluster_count

        point_estimate = float(np.mean(per_cluster_mean))
        variance_of_mean = np.var(per_cluster_mean, ddof=1) / n_clusters
        interval = CLTConfidenceInterval(
            mean=point_estimate,
            std=np.sqrt(variance_of_mean),
            confidence_level=confidence_level,
        )
        return ClassicalMeanInferenceResult(
            confidence_interval=interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n=n_observations,
        )

estimate

estimate(
    y, clusters, metric_name="Metric", confidence_level=0.95
)

Estimate the population mean using the clustered classical estimator.

Computes within-cluster means and uses them as sampling units to apply the CLT:

theta = (1 / L) * sum_l m_l
sigma2 = Var(m_l, ddof=1) / L

where m_l = (1/n_l) * sum_{i in l} y_i are the cluster means and L is the number of clusters. NaN values in y are dropped before making the computations. Clusters that contain only NaN are not used.

Parameters:

Name	Type	Description	Default
`y`	`NDArray`	Array of observations, shape `(n_samples,)`. NaN values are treated as missing and dropped.	required
`clusters`	`NDArray`	Array of cluster identifiers, shape `(n_samples,)`. Unique values define the clusters.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`

Returns:

Type	Description
`ClassicalMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"ClusteredClassicalMeanEstimator"`), and `n` (total number of non-NaN observations across all clusters).

Raises:

Type	Description
`ValueError`	If `y` and `clusters` do not have the same length. If `clusters` contains NaN values (numeric dtype) or None values (non-numeric dtype). If fewer than 2 clusters have at least one non-NaN observation.

Source code in glide/estimators/clustered_classical.py

def estimate(
    self,
    y: NDArray,
    clusters: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
) -> ClassicalMeanInferenceResult:
    """Estimate the population mean from cluster means.

    Within-cluster means are computed and then treated as the sampling
    units for the CLT:

        theta = (1 / L) * sum_l m_l
        sigma2 = Var(m_l, ddof=1) / L

    where ``m_l = (1/n_l) * sum_{i in l} y_i`` are the cluster means and
    ``L`` is the number of clusters. NaN values in ``y`` are dropped
    first, and clusters left with no observation are not used.

    Parameters
    ----------
    y : NDArray
        Array of observations, shape ``(n_samples,)``. NaN values are
        treated as missing and dropped.
    clusters : NDArray
        Array of cluster identifiers, shape ``(n_samples,)``.
        Unique values define the clusters.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.

    Returns
    -------
    ClassicalMeanInferenceResult
        Bundles the CLT-based confidence interval, the metric name,
        the estimator name (``"ClusteredClassicalMeanEstimator"``), and ``n``
        (total number of non-NaN observations across all clusters).

    Raises
    ------
    ValueError
        - If ``y`` and ``clusters`` do not have the same length.
        - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If fewer than 2 clusters have at least one non-NaN observation.
    """
    kept_y, labels, n_clusters = self._preprocess(y, clusters)
    n_observations = len(kept_y)

    # Per-cluster sum and count via bincount on the integer labels.
    per_cluster_total = np.bincount(labels, weights=kept_y)
    per_cluster_count = np.bincount(labels)
    per_cluster_mean = per_cluster_total / per_cluster_count

    point_estimate = float(np.mean(per_cluster_mean))
    variance_of_mean = np.var(per_cluster_mean, ddof=1) / n_clusters
    interval = CLTConfidenceInterval(
        mean=point_estimate,
        std=np.sqrt(variance_of_mean),
        confidence_level=confidence_level,
    )
    return ClassicalMeanInferenceResult(
        confidence_interval=interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n=n_observations,
    )

glide.estimators.ppi.PPIMeanEstimator

Estimator for population mean using Prediction-Powered Inference (PPI).

This class implements the PPI method which combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy model. The method provides consistent estimates even when the proxy is imperfect. An optional power-tuning mode (enabled by default) applies the optimal weight λ from PPI++, ensuring the confidence interval is never wider than the one obtained without the proxy.

References

Angelopoulos, Anastasios N., Stephen Bates, Clara Fannjiang, Michael I. Jordan, and Tijana Zrnic. "Prediction-powered inference." Science 382, no. 6671 (2023): 669-674.

Angelopoulos, Anastasios N., John C. Duchi, and Tijana Zrnic. "PPI++: Efficient prediction-powered inference." arXiv preprint arXiv:2311.01453 (2023).

Examples:

>>> import numpy as np
>>> from glide.estimators import PPIMeanEstimator
>>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
>>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
>>> estimator = PPIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy)
>>> print(result)
Metric: Metric
Point Estimate: 5.618
Confidence Interval (95%): [4.923, 6.312]
Estimator : PPIMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 3

Source code in glide/estimators/ppi.py

class PPIMeanEstimator:
    """Population-mean estimator based on Prediction-Powered Inference (PPI).

    PPI pairs a small labeled sample with a larger pool of observations for which
    only a proxy prediction (e.g. a model output) is available. The labeled sample
    is used to correct the bias of the proxy, so the estimate stays consistent even
    when the proxy is imperfect. With power tuning (on by default) the optimal
    PPI++ weight λ is applied, which ensures the confidence interval is never wider
    than the one obtained without the proxy.

    References
    ----------
    Angelopoulos, Anastasios N., Stephen Bates, Clara Fannjiang, Michael I. Jordan, and Tijana
    Zrnic. "Prediction-powered inference." Science 382, no. 6671 (2023): 669-674.

    Angelopoulos, Anastasios N., John C. Duchi, and Tijana Zrnic. "PPI++: Efficient
    prediction-powered inference." arXiv preprint arXiv:2311.01453 (2023).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import PPIMeanEstimator
    >>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
    >>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
    >>> estimator = PPIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.618
    Confidence Interval (95%): [4.923, 6.312]
    Estimator : PPIMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 3
    """

    def _preprocess(self, y_true_all: NDArray, y_proxy_all: NDArray) -> Tuple[NDArray, NDArray, NDArray]:
        """Validate the raw inputs and split them into labeled / unlabeled parts.

        The validators run in a fixed order (equal lengths, proxy, labels) before the
        split, and the sample-size check runs on the labeled mask produced by the split.

        Returns
        -------
        Tuple[NDArray, NDArray, NDArray]
            ``(y_true, y_proxy_labeled, y_proxy_unlabeled)`` exactly as returned by
            ``_split_labeled_unlabeled``; the labeled mask itself is not returned.
        """
        _validate_equal_lengths(y_true_all, y_proxy_all, names=["y_true", "y_proxy"])
        _validate_y_proxy(y_proxy_all)
        _validate_y_true(y_true_all)

        split = _split_labeled_unlabeled(y_true_all, y_proxy_all)
        labeled_targets, labeled_proxy, unlabeled_proxy, labeled_mask = split
        _validate_sample_sizes(labeled_mask)
        return labeled_targets, labeled_proxy, unlabeled_proxy

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Compute the PPI / PPI++ estimate of the population mean.

        A small labeled sample is combined with a large unlabeled sample whose labels
        are approximated by a proxy (e.g. a pretrained model). The rectifier
        ``mean(y_true) - λ·mean(y_proxy_labeled)`` corrects the bias of the proxy, so
        the estimate is consistent even when the proxy is imperfect.

        The weight λ interpolates between using ``y_true`` alone (λ = 0) and the
        standard PPI estimate, which weights ``y_true`` and ``y_proxy`` equally
        (λ = 1). With ``power_tuning=True`` (default) the optimal λ is obtained from
        the PPI++ closed-form formula so as to minimise the confidence interval
        width; with ``power_tuning=False`` λ = 1 and the classic PPI estimator is
        recovered.

        Parameters
        ----------
        y_true : NDArray
            Array of labeled observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
        y_proxy : NDArray
            Array of proxy predictions, shape ``(n_samples,)``.
            Must be fully populated (no NaN). Must have nonzero variance.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval, e.g. ``0.95``
            for a 95 % CI. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), compute the optimal λ via the PPI++ formula
            to minimise CI width. If ``False``, use λ = 1 (classic PPI).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds the CLT-based confidence interval, the metric name, the estimator
            name (``"PPIMeanEstimator"``), the counts ``n_true`` (labeled
            observations) and ``n_proxy`` (all observations with a proxy
            prediction), and the effective sample size.

        Raises
        ------
        ValueError
            - If ``y_true`` and ``y_proxy`` have different lengths.
            - If any proxy value is NaN.
            - If all proxy values are identical.
            - If labeled ``y_true`` values are constant.
            - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        labeled_targets, labeled_proxy, unlabeled_proxy = self._preprocess(y_true, y_proxy)
        n_labeled = len(labeled_targets)
        n_unlabeled = len(unlabeled_proxy)

        lambda_ = _compute_tuning_parameter(labeled_targets, labeled_proxy, unlabeled_proxy, power_tuning)
        ppi_interval = CLTConfidenceInterval(
            mean=_compute_mean_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_),
            std=_compute_std_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_),
            confidence_level=confidence_level,
        )

        # Effective sample size: the labeled count scaled by the ratio of the
        # classical (labels-only) variance to the PPI variance, rounded down.
        baseline_interval = ClassicalMeanEstimator().estimate(labeled_targets).confidence_interval
        effective_sample_size = floor(n_labeled * baseline_interval.var / ppi_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=ppi_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            n_proxy=n_labeled + n_unlabeled,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using Prediction-Powered Inference (PPI).

Combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy (e.g. a pretrained model). The rectifier mean(y_true) - λ·mean(y_proxy_labeled) corrects the bias of the proxy, yielding a consistent estimate even when the proxy is imperfect.

The weight λ interpolates between relying only on y_true (λ = 0) and the standard PPI estimate, which leverages both y_true and y_proxy with equal weights (λ = 1). When power_tuning=True (default), the optimal λ is computed via the PPI++ closed-form formula to minimise the confidence interval width. When power_tuning=False, λ = 1 and the estimator reduces to the classic PPI estimator.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of labeled observations, shape `(n_samples,)`. Labeled entries are finite; unlabeled entries are `np.nan`.	required
`y_proxy`	`NDArray`	Array of proxy predictions, shape `(n_samples,)`. Must be fully populated (no NaN). Must have nonzero variance.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval, e.g. `0.95` for a 95 % CI. Defaults to `0.95`.	`0.95`
`power_tuning`	`bool`	If `True` (default), compute the optimal λ via the PPI++ formula to minimise CI width. If `False`, use λ = 1 (classic PPI).	`True`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"PPIMeanEstimator"`), and the counts `n_true` (labeled observations) and `n_proxy` (all observations with a proxy prediction).

Raises:

Type	Description
`ValueError`	If `y_true` and `y_proxy` have different lengths. If any proxy value is NaN. If all proxy values are identical. If labeled `y_true` values are constant. If there are fewer than 2 labeled or fewer than 2 unlabeled samples.

Source code in glide/estimators/ppi.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Compute the PPI / PPI++ estimate of the population mean.

    A small labeled sample is combined with a large unlabeled sample whose labels
    are approximated by a proxy (e.g. a pretrained model). The rectifier
    ``mean(y_true) - λ·mean(y_proxy_labeled)`` corrects the bias of the proxy, so
    the estimate is consistent even when the proxy is imperfect.

    The weight λ interpolates between using ``y_true`` alone (λ = 0) and the
    standard PPI estimate, which weights ``y_true`` and ``y_proxy`` equally
    (λ = 1). With ``power_tuning=True`` (default) the optimal λ is obtained from
    the PPI++ closed-form formula so as to minimise the confidence interval
    width; with ``power_tuning=False`` λ = 1 and the classic PPI estimator is
    recovered.

    Parameters
    ----------
    y_true : NDArray
        Array of labeled observations, shape ``(n_samples,)``.
        Labeled entries are finite; unlabeled entries are ``np.nan``.
    y_proxy : NDArray
        Array of proxy predictions, shape ``(n_samples,)``.
        Must be fully populated (no NaN). Must have nonzero variance.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval, e.g. ``0.95``
        for a 95 % CI. Defaults to ``0.95``.
    power_tuning : bool, optional
        If ``True`` (default), compute the optimal λ via the PPI++ formula
        to minimise CI width. If ``False``, use λ = 1 (classic PPI).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name, the estimator
        name (``"PPIMeanEstimator"``), the counts ``n_true`` (labeled
        observations) and ``n_proxy`` (all observations with a proxy
        prediction), and the effective sample size.

    Raises
    ------
    ValueError
        - If ``y_true`` and ``y_proxy`` have different lengths.
        - If any proxy value is NaN.
        - If all proxy values are identical.
        - If labeled ``y_true`` values are constant.
        - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    labeled_targets, labeled_proxy, unlabeled_proxy = self._preprocess(y_true, y_proxy)
    n_labeled = len(labeled_targets)
    n_unlabeled = len(unlabeled_proxy)

    lambda_ = _compute_tuning_parameter(labeled_targets, labeled_proxy, unlabeled_proxy, power_tuning)
    ppi_interval = CLTConfidenceInterval(
        mean=_compute_mean_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_),
        std=_compute_std_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_),
        confidence_level=confidence_level,
    )

    # Effective sample size: the labeled count scaled by the ratio of the
    # classical (labels-only) variance to the PPI variance, rounded down.
    baseline_interval = ClassicalMeanEstimator().estimate(labeled_targets).confidence_interval
    effective_sample_size = floor(n_labeled * baseline_interval.var / ppi_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=ppi_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        n_proxy=n_labeled + n_unlabeled,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.stratified_ppi.StratifiedPPIMeanEstimator

Stratified PPI++ estimator for population mean.

Extends Prediction-Powered Inference to datasets that are naturally partitioned into strata (e.g. by language, domain, or data source). A per-stratum power-tuned lambda is computed independently for each stratum, and the final estimate is a population-proportional weighted average of the per-stratum PPI++ estimates.

This yields narrower confidence intervals than standard PPI++ whenever strata differ in proxy quality or relative size, because the optimal lambda can adapt to each stratum's signal-to-noise ratio.

References

Fisch, Adam, Joshua Maynez, R. Alex Hofer, Bhuwan Dhingra, Amir Globerson, and William W. Cohen. "Stratified prediction-powered inference for effective hybrid evaluation of language models." Advances in Neural Information Processing Systems 37 (2024): 111489-111514.

Fogliato, Riccardo, Pratik Patil, Mathew Monfort, and Pietro Perona. "A framework for efficient model evaluation through stratification, sampling, and estimation." In European Conference on Computer Vision, pp. 140-158. Cham: Springer Nature Switzerland, 2024.

Examples:

>>> import numpy as np
>>> from glide.estimators import StratifiedPPIMeanEstimator
>>> y_true = np.array([1.0, 2.0, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan])
>>> y_proxy = np.array([1.1, 2.2, 1.5, 1.8, 3.9, 5.1, 4.5, 4.8])
>>> groups = np.array([0, 0, 0, 0, 1, 1, 1, 1])
>>> estimator = StratifiedPPIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, groups)
>>> print(result)
Metric: Metric
Point Estimate: 3.086
Confidence Interval (95%): [2.720, 3.452]
Estimator : StratifiedPPIMeanEstimator
n_true: 4
n_proxy: 8
Effective Sample Size: 14

Source code in glide/estimators/stratified_ppi.py

class StratifiedPPIMeanEstimator:
    """Population-mean estimator using stratified PPI++.

    Prediction-Powered Inference is applied separately to each stratum of a dataset
    that is naturally partitioned (e.g. by language, domain, or data source). Every
    stratum gets its own power-tuned lambda, and the per-stratum PPI++ estimates are
    averaged with weights proportional to the stratum's share of the population.

    Because the optimal lambda can adapt to each stratum's signal-to-noise ratio,
    the resulting confidence intervals are narrower than those of standard PPI++
    whenever strata differ in proxy quality or relative size.

    References
    ----------
    Fisch, Adam, Joshua Maynez, R. Alex Hofer, Bhuwan Dhingra, Amir Globerson, and
    William W. Cohen. "Stratified prediction-powered inference for effective hybrid
    evaluation of language models." Advances in Neural Information Processing
    Systems 37 (2024): 111489-111514.

    Fogliato, Riccardo, Pratik Patil, Mathew Monfort, and Pietro Perona. "A framework
    for efficient model evaluation through stratification, sampling, and estimation."
    In European Conference on Computer Vision, pp. 140-158. Cham: Springer Nature
    Switzerland, 2024.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import StratifiedPPIMeanEstimator
    >>> y_true = np.array([1.0, 2.0, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan])
    >>> y_proxy = np.array([1.1, 2.2, 1.5, 1.8, 3.9, 5.1, 4.5, 4.8])
    >>> groups = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    >>> estimator = StratifiedPPIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, groups)
    >>> print(result)
    Metric: Metric
    Point Estimate: 3.086
    Confidence Interval (95%): [2.720, 3.452]
    Estimator : StratifiedPPIMeanEstimator
    n_true: 4
    n_proxy: 8
    Effective Sample Size: 14
    """

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        groups: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Compute the stratified PPI++ estimate of the population mean.

        The arrays are split by the unique values of ``groups``. A PPI++ estimate
        is computed inside every stratum and the results are combined with
        population-proportional weights:

            theta = sum_k  w_k * theta_k(lambda_k)
            sigma2 = sum_k  w_k^2 * sigma2_k(lambda_k)

        where ``w_k`` is the fraction of samples in stratum *k*.

        Statistical validity relies on the proportion of labeled vs unlabeled
        samples being approximately the same in all strata.

        A sample counts as labeled when its ``y_true`` entry is not ``NaN``;
        ``NaN`` entries mark unlabeled samples.

        Parameters
        ----------
        y_true : NDArray
            Array of observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
        y_proxy : NDArray
            Array of proxy predictions, shape ``(n_samples,)``.
            Must be fully populated (no NaN). Must have nonzero variance.
        groups : NDArray
            Array of integer stratum identifiers, shape ``(n_samples,)``. Unique
            values define the strata.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), compute the optimal ``lambda_k`` per stratum
            via the PPI++ formula. If ``False``, use ``lambda_k = 1.0`` for all
            strata (classic PPI).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds the CLT-based confidence interval, the metric name, the estimator
            name (``"StratifiedPPIMeanEstimator"``), the counts ``n_true`` (total
            labeled rows) and ``n_proxy`` (total dataset size), and the effective
            sample size.

        Raises
        ------
        ValueError
            - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
            - If any proxy value is NaN.
            - If labeled ``y_true`` values are constant.
            - If all proxy values within a stratum are identical.
            - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        strata = _preprocess(y_true, y_proxy, groups)
        n_samples = len(y_true)

        # Accumulate the weighted point estimate and the weighted variance.
        combined_mean = 0.0
        combined_var = 0.0
        for labeled_targets, labeled_proxy, unlabeled_proxy in strata:
            # Stratum weight = share of all samples that belong to this stratum.
            weight = (len(labeled_targets) + len(unlabeled_proxy)) / n_samples

            lambda_k = _compute_tuning_parameter(labeled_targets, labeled_proxy, unlabeled_proxy, power_tuning)
            stratum_mean = _compute_mean_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_k)
            stratum_std = _compute_std_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_k)

            combined_mean += weight * stratum_mean
            combined_var += weight**2 * stratum_std**2

        combined_std = np.sqrt(combined_var)
        labeled_mask = ~np.isnan(y_true)
        n_true = int(labeled_mask.sum())

        ppi_interval = CLTConfidenceInterval(
            mean=combined_mean,
            std=combined_std,
            confidence_level=confidence_level,
        )

        # Baseline for the effective sample size: the stratified classical estimator
        # on the same data, weighted by each stratum's share of the samples.
        stratum_counts = np.unique(groups, return_counts=True)[1]
        baseline_result = StratifiedClassicalMeanEstimator().estimate(
            y_true,
            groups,
            stratum_weights=stratum_counts / n_samples,
        )
        baseline_interval = baseline_result.confidence_interval
        effective_sample_size = floor(n_true * baseline_interval.var / ppi_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=ppi_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_true,
            n_proxy=n_samples,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    groups,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using Stratified PPI++.

Splits arrays by unique values in groups, computes a power-tuned PPI++ estimate within each stratum, and combines them with population-proportional weights:

theta = sum_k  w_k * theta_k(lambda_k)
sigma2 = sum_k  w_k^2 * sigma2_k(lambda_k)

where w_k is the fraction of samples in stratum k.

Note that this assumes the proportions of labeled vs unlabeled samples are approximately the same in all strata, which is important for statistical validity.

Labeled and unlabeled samples are distinguished by NaN in y_true: a sample is labeled if its y_true entry is not NaN.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of observations, shape `(n_samples,)`. Labeled entries are finite; unlabeled entries are `np.nan`.	required
`y_proxy`	`NDArray`	Array of proxy predictions, shape `(n_samples,)`. Must be fully populated (no NaN). Must have nonzero variance.	required
`groups`	`NDArray`	Array of integer stratum identifiers, shape `(n_samples,)`. Unique values define the strata.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`power_tuning`	`bool`	If `True` (default), compute the optimal `lambda_k` per stratum via the PPI++ formula. If `False`, use `lambda_k = 1.0` for all strata (classic PPI).	`True`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"StratifiedPPIMeanEstimator"`), and the counts `n_true` (total labeled rows) and `n_proxy` (total dataset size).

Raises:

Type	Description
`ValueError`	If `groups` contains NaN values (numeric dtype) or None values (non-numeric dtype). If `y_true`, `y_proxy`, and `groups` do not all have the same length. If any proxy value is NaN. If labeled `y_true` values are constant. If all proxy values within a stratum are identical. If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.

Source code in glide/estimators/stratified_ppi.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    groups: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Compute the stratified PPI++ estimate of the population mean.

    The arrays are split by the unique values of ``groups``. A PPI++ estimate
    is computed inside every stratum and the results are combined with
    population-proportional weights:

        theta = sum_k  w_k * theta_k(lambda_k)
        sigma2 = sum_k  w_k^2 * sigma2_k(lambda_k)

    where ``w_k`` is the fraction of samples in stratum *k*.

    Statistical validity relies on the proportion of labeled vs unlabeled
    samples being approximately the same in all strata.

    A sample counts as labeled when its ``y_true`` entry is not ``NaN``;
    ``NaN`` entries mark unlabeled samples.

    Parameters
    ----------
    y_true : NDArray
        Array of observations, shape ``(n_samples,)``.
        Labeled entries are finite; unlabeled entries are ``np.nan``.
    y_proxy : NDArray
        Array of proxy predictions, shape ``(n_samples,)``.
        Must be fully populated (no NaN). Must have nonzero variance.
    groups : NDArray
        Array of integer stratum identifiers, shape ``(n_samples,)``. Unique
        values define the strata.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    power_tuning : bool, optional
        If ``True`` (default), compute the optimal ``lambda_k`` per stratum
        via the PPI++ formula. If ``False``, use ``lambda_k = 1.0`` for all
        strata (classic PPI).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name, the estimator
        name (``"StratifiedPPIMeanEstimator"``), the counts ``n_true`` (total
        labeled rows) and ``n_proxy`` (total dataset size), and the effective
        sample size.

    Raises
    ------
    ValueError
        - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
        - If any proxy value is NaN.
        - If labeled ``y_true`` values are constant.
        - If all proxy values within a stratum are identical.
        - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    strata = _preprocess(y_true, y_proxy, groups)
    n_samples = len(y_true)

    # Accumulate the weighted point estimate and the weighted variance.
    combined_mean = 0.0
    combined_var = 0.0
    for labeled_targets, labeled_proxy, unlabeled_proxy in strata:
        # Stratum weight = share of all samples that belong to this stratum.
        weight = (len(labeled_targets) + len(unlabeled_proxy)) / n_samples

        lambda_k = _compute_tuning_parameter(labeled_targets, labeled_proxy, unlabeled_proxy, power_tuning)
        stratum_mean = _compute_mean_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_k)
        stratum_std = _compute_std_estimate(labeled_targets, labeled_proxy, unlabeled_proxy, lambda_k)

        combined_mean += weight * stratum_mean
        combined_var += weight**2 * stratum_std**2

    combined_std = np.sqrt(combined_var)
    labeled_mask = ~np.isnan(y_true)
    n_true = int(labeled_mask.sum())

    ppi_interval = CLTConfidenceInterval(
        mean=combined_mean,
        std=combined_std,
        confidence_level=confidence_level,
    )

    # Baseline for the effective sample size: the stratified classical estimator
    # on the same data, weighted by each stratum's share of the samples.
    stratum_counts = np.unique(groups, return_counts=True)[1]
    baseline_result = StratifiedClassicalMeanEstimator().estimate(
        y_true,
        groups,
        stratum_weights=stratum_counts / n_samples,
    )
    baseline_interval = baseline_result.confidence_interval
    effective_sample_size = floor(n_true * baseline_interval.var / ppi_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=ppi_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_true,
        n_proxy=n_samples,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.asi.ASIMeanEstimator

Estimator for population mean using Active Statistical Inference (ASI).

This class implements the ASI method which extends PPI++ to non-uniform sampling. Each labeled sample has a known, pre-determined sampling probability π_i. Inverse probability weighting (IPW) corrects for this non-uniform selection, yielding valid confidence intervals under any sampling rule.

The special case where all π_i are equal to n_labeled / n recovers PPI++ at λ = 1.

References

Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.

Gligorić, Kristina, Tijana Zrnic, Cinoo Lee, Emmanuel Candès, and Dan Jurafsky. "Can unconfident LLM annotations be used for confident conclusions?" In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 3514-3533. 2025.

Examples:

>>> import numpy as np
>>> from glide.estimators import ASIMeanEstimator
>>> y_true = np.array([0.0, 1.0, np.nan, np.nan])
>>> y_proxy = np.array([0.1, 0.9, 0.5, 0.5])
>>> pi = np.array([0.8, 0.8, 0.8, 0.8])
>>> estimator = ASIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, pi)
>>> print(result)
Metric: Metric
Point Estimate: 0.548
Confidence Interval (95%): [0.138, 0.958]
Estimator : ASIMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 4

Source code in glide/estimators/asi.py

class ASIMeanEstimator:
    """Estimator for population mean using Active Statistical Inference (ASI).

    This class implements the ASI method which extends PPI++ to non-uniform sampling.
    Each labeled sample has a known, pre-determined sampling probability π_i. Inverse
    probability weighting (IPW) corrects for this non-uniform selection, yielding valid
    confidence intervals under any sampling rule.

    The special case where all π_i are equal to n_labeled / n recovers PPI++ at λ = 1.

    References
    ----------
    Zrnic, Tijana, and Emmanuel J. Candès. "Active statistical inference." In Proceedings
    of the 41st International Conference on Machine Learning, pp. 62993-63010. 2024.

    Gligorić, Kristina, Tijana Zrnic, Cinoo Lee, Emmanuel Candes, and Dan Jurafsky.
    "Can unconfident llm annotations be used for confident conclusions?." In Proceedings
    of the 2025 Conference of the Nations of the Americas Chapter of the Association for
    Computational Linguistics: Human Language Technologies (Volume 1: Long Papers),
    pp. 3514-3533. 2025.

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ASIMeanEstimator
    >>> y_true = np.array([0.0, 1.0, np.nan, np.nan])
    >>> y_proxy = np.array([0.1, 0.9, 0.5, 0.5])
    >>> pi = np.array([0.8, 0.8, 0.8, 0.8])
    >>> estimator = ASIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, pi)
    >>> print(result)
    Metric: Metric
    Point Estimate: 0.548
    Confidence Interval (95%): [0.138, 0.958]
    Estimator : ASIMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 4
    """

    def _preprocess(
        self,
        y_true_all: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
    ) -> Tuple[NDArray, NDArray, NDArray, NDArray]:
        """Validate the inputs and build the arrays used by the ASI computations.

        Steps, in order: check that the three arrays have equal lengths, validate
        the proxy values and the probabilities, derive the labeled indicator ξ
        from the non-NaN entries of ``y_true_all`` and check it against ``pi``,
        keep only the entries selected by ``_get_non_zero_mask(pi)``, and reject
        inputs whose rectifier term ``y_proxy * (ξ / π - 1)`` is constant.

        Returns
        -------
        Tuple[NDArray, NDArray, NDArray, NDArray]
            ``(y_true_filled, y_proxy, xi, pi)`` restricted to the kept entries.
            ``xi`` is a float array (1.0 for labeled, 0.0 for unlabeled) and
            ``y_true_filled`` has its NaN entries replaced by 0.
        """
        _validate_equal_lengths(y_true_all, y_proxy, pi, names=["y_true", "y_proxy", "pi"])
        _validate_y_proxy(y_proxy)
        _validate_probabilities(pi)

        is_labeled = ~np.isnan(y_true_all)
        _validate_label_prob_consistency(is_labeled, pi)
        xi = is_labeled.astype(float)

        # Apply the same mask to every array so they stay aligned.
        keep = _get_non_zero_mask(pi)
        targets_kept, proxy_kept, pi_kept, xi_kept = (
            values[keep] for values in (y_true_all, y_proxy, pi, xi)
        )

        rectifier_term = proxy_kept * (xi_kept / pi_kept - 1)
        _validate_non_constant(rectifier_term, "'y_proxy' values lead to constant rectifiers.")

        # Unlabeled targets are NaN; they are always multiplied by xi == 0
        # downstream, so any finite placeholder works and 0 is used.
        targets_filled = np.nan_to_num(targets_kept, nan=0)
        return targets_filled, proxy_kept, xi_kept, pi_kept

    def _compute_tuning_parameter(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        xi: NDArray,
        pi: NDArray,
        power_tuning: bool,
    ) -> float:
        if not power_tuning:
            return 1.0
        a = y_proxy * (xi / pi - 1)
        b = y_true * xi / pi
        cov_matrix = np.cov(a, b, ddof=1)
        var, cov = cov_matrix[0]
        _lambda = cov / var
        return _lambda

    def _compute_rectified_labels(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        xi: NDArray,
        pi: NDArray,
        _lambda: float,
    ) -> NDArray:
        rectified_labels = _lambda * y_proxy + xi * (y_true - _lambda * y_proxy) / pi
        return rectified_labels

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean with Active Statistical Inference (ASI).

        Every retained sample contributes a rectified label
        ``λ * y_proxy + ξ * (y_true - λ * y_proxy) / π``. The point estimate is the
        mean of these values and the standard error is their sample standard
        deviation (``ddof=1``) divided by ``sqrt(n_proxy)``. The division by π is
        the inverse-probability weighting (IPW) that corrects for non-uniform
        sampling of the labels.

        Parameters
        ----------
        y_true : NDArray
            Ground-truth labels, shape ``(n_samples,)``. ``np.nan`` marks an
            unlabeled sample (ξ_i = 0); any non-NaN entry counts as labeled (ξ_i = 1).
        y_proxy : NDArray
            Proxy predictions, shape ``(n_samples,)``. Required for every sample and
            must not contain NaN.
        pi : NDArray
            Pre-determined sampling probabilities π_i ∈ [0, 1], shape ``(n_samples,)``.
            Samples with π_i = 0 are excluded from all computations.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), λ is chosen analytically to minimise the
            asymptotic variance. If ``False``, λ is fixed at 1.

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds the ``CLTConfidenceInterval``, the metric name, the estimator name
            (``"ASIMeanEstimator"``), ``n_true`` (labeled samples), ``n_proxy``
            (samples retained by preprocessing) and ``effective_sample_size``
            (``floor(n_true * var_classical / var_asi)``, where ``var_classical``
            comes from ``IPWClassicalMeanEstimator`` run on ``y_true`` and ``pi``).

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
            - If any proxy value is NaN.
            - If the rectifiers ``y_proxy * (ξ_i / π_i - 1)`` are constant.
            - If any value in ``pi`` is not in [0, 1].
        """
        labels, proxies, labeled_indicator, probabilities = self._preprocess(y_true, y_proxy, pi)

        labeled_count = int(labeled_indicator.sum())
        retained_count = len(probabilities)

        tuning = self._compute_tuning_parameter(labels, proxies, labeled_indicator, probabilities, power_tuning)
        rectified = self._compute_rectified_labels(labels, proxies, labeled_indicator, probabilities, tuning)

        asi_interval = CLTConfidenceInterval(
            mean=np.mean(rectified),
            std=np.std(rectified, ddof=1) / np.sqrt(retained_count),
            confidence_level=confidence_level,
        )

        # Baseline without proxies: its variance relative to the ASI variance
        # rescales the labeled count into an effective sample size.
        baseline_interval = IPWClassicalMeanEstimator().estimate(y_true, pi).confidence_interval
        variance_ratio_scaled = labeled_count * baseline_interval.var / asi_interval.var

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=asi_interval,
            metric_name=metric_name,
            estimator_name=type(self).__name__,
            n_true=labeled_count,
            n_proxy=retained_count,
            effective_sample_size=floor(variance_ratio_scaled),
        )

estimate

estimate(
    y_true,
    y_proxy,
    pi,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using Active Statistical Inference (ASI).

Uses inverse-probability weighting (IPW) to correct for non-uniform sampling, combining labeled and unlabeled samples into a single IPW-corrected estimator. A power-tuning step (enabled by default) finds the λ that minimises asymptotic variance.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of shape `(n_samples,)` with ground-truth labels. Use `np.nan` for unlabeled samples (ξ_i = 0); non-NaN entries are treated as labeled (ξ_i = 1).	required
`y_proxy`	`NDArray`	Array of shape `(n_samples,)` with proxy predictions. Must be present for every sample and must not contain NaN.	required
`pi`	`NDArray`	Array of shape `(n_samples,)` with the pre-determined sampling probability π_i ∈ [0, 1] for each sample. Entries with π_i = 0 are excluded from all computations.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`power_tuning`	`bool`	If `True` (default), selects λ analytically to minimise asymptotic variance. If `False`, uses λ = 1 (the untuned ASI estimator, `y_proxy + ξ (y_true − y_proxy) / π`; plain IPW would correspond to λ = 0).	`True`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains a `CLTConfidenceInterval`, metric name, estimator name (`"ASIMeanEstimator"`), and counts `n_true` (labeled samples) and `n_proxy` (total samples).

Raises:

Type	Description
`ValueError`	If `y_true`, `y_proxy`, and `pi` do not all have the same length. If any proxy value is NaN. If the rectifiers `y_proxy * (ξ_i / π_i - 1)` are constant. If any value in `pi` is not in [0, 1].

Source code in glide/estimators/asi.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    pi: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean with Active Statistical Inference (ASI).

    Every retained sample contributes a rectified label
    ``λ * y_proxy + ξ * (y_true - λ * y_proxy) / π``. The point estimate is the
    mean of these values and the standard error is their sample standard
    deviation (``ddof=1``) divided by ``sqrt(n_proxy)``. The division by π is
    the inverse-probability weighting (IPW) that corrects for non-uniform
    sampling of the labels.

    Parameters
    ----------
    y_true : NDArray
        Ground-truth labels, shape ``(n_samples,)``. ``np.nan`` marks an
        unlabeled sample (ξ_i = 0); any non-NaN entry counts as labeled (ξ_i = 1).
    y_proxy : NDArray
        Proxy predictions, shape ``(n_samples,)``. Required for every sample and
        must not contain NaN.
    pi : NDArray
        Pre-determined sampling probabilities π_i ∈ [0, 1], shape ``(n_samples,)``.
        Samples with π_i = 0 are excluded from all computations.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    power_tuning : bool, optional
        If ``True`` (default), λ is chosen analytically to minimise the
        asymptotic variance. If ``False``, λ is fixed at 1.

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the ``CLTConfidenceInterval``, the metric name, the estimator name
        (``"ASIMeanEstimator"``), ``n_true`` (labeled samples), ``n_proxy``
        (samples retained by preprocessing) and ``effective_sample_size``
        (``floor(n_true * var_classical / var_asi)``, where ``var_classical``
        comes from ``IPWClassicalMeanEstimator`` run on ``y_true`` and ``pi``).

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
        - If any proxy value is NaN.
        - If the rectifiers ``y_proxy * (ξ_i / π_i - 1)`` are constant.
        - If any value in ``pi`` is not in [0, 1].
    """
    labels, proxies, labeled_indicator, probabilities = self._preprocess(y_true, y_proxy, pi)

    labeled_count = int(labeled_indicator.sum())
    retained_count = len(probabilities)

    tuning = self._compute_tuning_parameter(labels, proxies, labeled_indicator, probabilities, power_tuning)
    rectified = self._compute_rectified_labels(labels, proxies, labeled_indicator, probabilities, tuning)

    asi_interval = CLTConfidenceInterval(
        mean=np.mean(rectified),
        std=np.std(rectified, ddof=1) / np.sqrt(retained_count),
        confidence_level=confidence_level,
    )

    # Baseline without proxies: its variance relative to the ASI variance
    # rescales the labeled count into an effective sample size.
    baseline_interval = IPWClassicalMeanEstimator().estimate(y_true, pi).confidence_interval
    variance_ratio_scaled = labeled_count * baseline_interval.var / asi_interval.var

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=asi_interval,
        metric_name=metric_name,
        estimator_name=type(self).__name__,
        n_true=labeled_count,
        n_proxy=retained_count,
        effective_sample_size=floor(variance_ratio_scaled),
    )

glide.estimators.clustered_ppi.ClusteredPPIMeanEstimator

Clustered PPI++ estimator for population mean.

Extends PPI++ mean estimation as in PPIMeanEstimator to datasets where observations are grouped into clusters. Each cluster's true and proxy means are treated as the sampling units, which accounts for within-cluster correlation and produces valid confidence intervals under cluster sampling designs.

References

Broska, David. "Cluster-robust PPI reference implementation." https://github.com/davidbroska/ppi_py/blob/main/ClusterPPI/mean.py

Examples:

>>> import numpy as np
>>> from glide.estimators import ClusteredPPIMeanEstimator
>>> y_true = np.array([1.0, 2.0, 3.0, 4.0, np.nan, np.nan, np.nan, np.nan])
>>> y_proxy = np.array([1.1, 2.2, 3.1, 3.9, 1.5, 1.8, 4.5, 4.8])
>>> clusters = np.array(["A", "A", "B", "B", "C", "C", "D", "D"])
>>> estimator = ClusteredPPIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, clusters)
>>> print(result)
Metric: Metric
Point Estimate: 2.744
Confidence Interval (95%): [1.020, 4.468]
Estimator : ClusteredPPIMeanEstimator
n_true: 4
n_proxy: 8
Effective Sample Size: 5

Source code in glide/estimators/clustered_ppi.py

class ClusteredPPIMeanEstimator:
    """Clustered PPI++ estimator for population mean.

    Extends PPI++ mean estimation as in ``PPIMeanEstimator`` to datasets where
    observations are grouped into clusters. Each cluster's true and proxy means
    are treated as the sampling units, which accounts for within-cluster
    correlation and produces valid confidence intervals under cluster sampling
    designs.

    References
    ----------
    Broska, David. "Cluster-robust PPI reference implementation."
    https://github.com/davidbroska/ppi_py/blob/main/ClusterPPI/mean.py

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ClusteredPPIMeanEstimator
    >>> y_true = np.array([1.0, 2.0, 3.0, 4.0, np.nan, np.nan, np.nan, np.nan])
    >>> y_proxy = np.array([1.1, 2.2, 3.1, 3.9, 1.5, 1.8, 4.5, 4.8])
    >>> clusters = np.array(["A", "A", "B", "B", "C", "C", "D", "D"])
    >>> estimator = ClusteredPPIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, clusters)
    >>> print(result)
    Metric: Metric
    Point Estimate: 2.744
    Confidence Interval (95%): [1.020, 4.468]
    Estimator : ClusteredPPIMeanEstimator
    n_true: 4
    n_proxy: 8
    Effective Sample Size: 5
    """

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        clusters: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using the Clustered PPI++ estimator.

        Computes cluster means for labeled and unlabeled clusters and uses them
        as sampling units to apply a PPI++-style bias correction:

            θ̂ = mean(u_l) + λ * (mean(v_l) - mean(s_l))

            Var(θ̂) = Var(u_l - λ*s_l, ddof=1) / M_L
                    + λ² * Var(v_l, ddof=1) / M_U

        where ``u_l`` and ``s_l`` are the true and proxy cluster means for
        labeled clusters, ``v_l`` are the proxy cluster means for unlabeled
        clusters, and ``M_L``, ``M_U`` are the numbers of labeled and unlabeled
        clusters.

        Labeled and unlabeled clusters are distinguished by the NaN pattern in
        ``y_true``: a cluster is labeled if every one of its ``y_true`` entries
        is finite, and unlabeled if every entry is ``np.nan``. Partially labeled
        clusters are not supported.

        Parameters
        ----------
        y_true : NDArray
            Array of observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
            All observations in the same cluster must share the same label status.
        y_proxy : NDArray
            Array of proxy predictions, shape ``(n_samples,)``.
            Must be fully populated (no NaN).
        clusters : NDArray
            Array of cluster identifiers, shape ``(n_samples,)``.
            Unique values define the clusters.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), estimate the optimal power-tuning parameter
            λ from the pooled cluster-level proxy mean variances. If ``False``,
            use λ = 1.0.

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Contains the CLT-based confidence interval, the metric name,
            the estimator name (``"ClusteredPPIMeanEstimator"``), the counts
            ``n_true`` (total labeled observations, as a Python ``int``) and
            ``n_proxy`` (total dataset size), and ``effective_sample_size``
            (``floor(n_true * var_classical / var_ppi)``, where
            ``var_classical`` comes from ``ClusteredClassicalMeanEstimator``
            run on ``y_true`` and ``clusters``).

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``clusters`` do not all have the
              same length.
            - If labeled ``y_true`` values are constant.
            - If any proxy value is NaN.
            - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If any cluster contains both labeled and unlabeled observations.
            - If fewer than 2 clusters are fully labeled.
            - If fewer than 2 clusters are fully unlabeled.
            - If ``power_tuning=True`` and proxy cluster means have zero variance across
              both labeled and unlabeled clusters.
        """
        (
            labeled_true_means,
            labeled_proxy_means,
            unlabeled_proxy_means,
        ) = _preprocess(y_true, y_proxy, clusters)

        _lambda = _compute_tuning_parameter(
            labeled_true_means, labeled_proxy_means, unlabeled_proxy_means, power_tuning
        )
        mean = _compute_mean_estimate(labeled_true_means, labeled_proxy_means, unlabeled_proxy_means, _lambda)
        std = _compute_std_estimate(labeled_true_means, labeled_proxy_means, unlabeled_proxy_means, _lambda)
        confidence_interval = CLTConfidenceInterval(
            mean=mean,
            std=std,
            confidence_level=confidence_level,
        )

        # np.sum returns a NumPy integer scalar; convert to a built-in int so
        # n_true has the same type as in the other estimators (which use
        # int(...) / len(...)).
        labeled_total_size = int(np.sum(~np.isnan(y_true)))
        # Baseline without proxies: its variance relative to the PPI variance
        # rescales the labeled count into an effective sample size.
        classical_confidence_interval = ClusteredClassicalMeanEstimator().estimate(y_true, clusters).confidence_interval
        effective_sample_size = floor(labeled_total_size * classical_confidence_interval.var / confidence_interval.var)
        result = PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=labeled_total_size,
            n_proxy=len(y_proxy),
            effective_sample_size=effective_sample_size,
        )
        return result

estimate

estimate(
    y_true,
    y_proxy,
    clusters,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using the Clustered PPI++ estimator.

Computes cluster means for labeled and unlabeled clusters and uses them as sampling units to apply a PPI++-style bias correction:

θ̂ = mean(u_l) + λ * (mean(v_l) - mean(s_l))

Var(θ̂) = Var(u_l - λ*s_l, ddof=1) / M_L
        + λ² * Var(v_l, ddof=1) / M_U

where u_l and s_l are the true and proxy cluster means for labeled clusters, v_l are the proxy cluster means for unlabeled clusters, and M_L, M_U are the numbers of labeled and unlabeled clusters.

Labeled and unlabeled clusters are distinguished by the NaN pattern in y_true: a cluster is labeled if every one of its y_true entries is finite, and unlabeled if every entry is np.nan. Partially labeled clusters are not supported.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of observations, shape `(n_samples,)`. Labeled entries are finite; unlabeled entries are `np.nan`. All observations in the same cluster must share the same label status.	required
`y_proxy`	`NDArray`	Array of proxy predictions, shape `(n_samples,)`. Must be fully populated (no NaN).	required
`clusters`	`NDArray`	Array of cluster identifiers, shape `(n_samples,)`. Unique values define the clusters.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`power_tuning`	`bool`	If `True` (default), estimate the optimal power-tuning parameter λ from the pooled cluster-level proxy mean variances. If `False`, use λ = 1.0.	`True`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"ClusteredPPIMeanEstimator"`), and the counts `n_true` (total labeled observations) and `n_proxy` (total dataset size).

Raises:

Type	Description
`ValueError`	If `y_true`, `y_proxy`, and `clusters` do not all have the same length. If labeled `y_true` values are constant. If any proxy value is NaN. If `clusters` contains NaN values (numeric dtype) or None values (non-numeric dtype). If any cluster contains both labeled and unlabeled observations. If fewer than 2 clusters are fully labeled. If fewer than 2 clusters are fully unlabeled. If `power_tuning=True` and proxy cluster means have zero variance across both labeled and unlabeled clusters.

Source code in glide/estimators/clustered_ppi.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    clusters: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean using the Clustered PPI++ estimator.

    Computes cluster means for labeled and unlabeled clusters and uses them
    as sampling units to apply a PPI++-style bias correction:

        θ̂ = mean(u_l) + λ * (mean(v_l) - mean(s_l))

        Var(θ̂) = Var(u_l - λ*s_l, ddof=1) / M_L
                + λ² * Var(v_l, ddof=1) / M_U

    where ``u_l`` and ``s_l`` are the true and proxy cluster means for
    labeled clusters, ``v_l`` are the proxy cluster means for unlabeled
    clusters, and ``M_L``, ``M_U`` are the numbers of labeled and unlabeled
    clusters.

    Labeled and unlabeled clusters are distinguished by the NaN pattern in
    ``y_true``: a cluster is labeled if every one of its ``y_true`` entries
    is finite, and unlabeled if every entry is ``np.nan``. Partially labeled
    clusters are not supported.

    Parameters
    ----------
    y_true : NDArray
        Array of observations, shape ``(n_samples,)``.
        Labeled entries are finite; unlabeled entries are ``np.nan``.
        All observations in the same cluster must share the same label status.
    y_proxy : NDArray
        Array of proxy predictions, shape ``(n_samples,)``.
        Must be fully populated (no NaN).
    clusters : NDArray
        Array of cluster identifiers, shape ``(n_samples,)``.
        Unique values define the clusters.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    power_tuning : bool, optional
        If ``True`` (default), estimate the optimal power-tuning parameter
        λ from the pooled cluster-level proxy mean variances. If ``False``,
        use λ = 1.0.

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Contains the CLT-based confidence interval, the metric name,
        the estimator name (``"ClusteredPPIMeanEstimator"``), the counts
        ``n_true`` (total labeled observations, as a Python ``int``) and
        ``n_proxy`` (total dataset size), and ``effective_sample_size``
        (``floor(n_true * var_classical / var_ppi)``, where
        ``var_classical`` comes from ``ClusteredClassicalMeanEstimator``
        run on ``y_true`` and ``clusters``).

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``clusters`` do not all have the
          same length.
        - If labeled ``y_true`` values are constant.
        - If any proxy value is NaN.
        - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If any cluster contains both labeled and unlabeled observations.
        - If fewer than 2 clusters are fully labeled.
        - If fewer than 2 clusters are fully unlabeled.
        - If ``power_tuning=True`` and proxy cluster means have zero variance across
          both labeled and unlabeled clusters.
    """
    (
        labeled_true_means,
        labeled_proxy_means,
        unlabeled_proxy_means,
    ) = _preprocess(y_true, y_proxy, clusters)

    _lambda = _compute_tuning_parameter(
        labeled_true_means, labeled_proxy_means, unlabeled_proxy_means, power_tuning
    )
    mean = _compute_mean_estimate(labeled_true_means, labeled_proxy_means, unlabeled_proxy_means, _lambda)
    std = _compute_std_estimate(labeled_true_means, labeled_proxy_means, unlabeled_proxy_means, _lambda)
    confidence_interval = CLTConfidenceInterval(
        mean=mean,
        std=std,
        confidence_level=confidence_level,
    )

    # np.sum returns a NumPy integer scalar; convert to a built-in int so
    # n_true has the same type as in the other estimators (which use
    # int(...) / len(...)).
    labeled_total_size = int(np.sum(~np.isnan(y_true)))
    # Baseline without proxies: its variance relative to the PPI variance
    # rescales the labeled count into an effective sample size.
    classical_confidence_interval = ClusteredClassicalMeanEstimator().estimate(y_true, clusters).confidence_interval
    effective_sample_size = floor(labeled_total_size * classical_confidence_interval.var / confidence_interval.var)
    result = PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=labeled_total_size,
        n_proxy=len(y_proxy),
        effective_sample_size=effective_sample_size,
    )
    return result

glide.estimators.multi_ppi.MultiPPIMeanEstimator

Estimator for population mean using Prediction-Powered Inference with multiple proxies.

This class extends PPIMeanEstimator to settings where M >= 1 proxy predictors are available. It finds the optimal tuning parameter vector lambda that minimises the mean squared error of the estimate, then applies the PPI correction with that combined proxy. This power tuning feature (enabled by default) ensures the estimator is always at least as efficient as the naive sample mean, regardless of the quality or number of proxies.

When M = 1, the estimator is equivalent to PPIMeanEstimator with power_tuning=True.

References

Shan, Jiawei, Zhifeng Chen, Yiming Dong, Yazhen Wang, and Jiwei Zhao. "SADA: Safe and Adaptive Aggregation of Multiple Black-Box Predictions in Semi-Supervised Learning." arXiv preprint arXiv:2509.21707 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators import MultiPPIMeanEstimator
>>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
>>> y_proxies = np.array([[4.9], [6.1], [5.2], [6.1]])
>>> estimator = MultiPPIMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxies)
>>> print(result)
Metric: Metric
Point Estimate: 5.618
Confidence Interval (95%): [4.923, 6.312]
Estimator : MultiPPIMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 3

Source code in glide/estimators/multi_ppi.py

class MultiPPIMeanEstimator:
    """Estimator for population mean using Prediction-Powered Inference with multiple proxies.

    Generalises PPIMeanEstimator to the case where M >= 1 proxy predictors are
    available. A vector of tuning parameters lambda is chosen to minimise the mean
    squared error of the estimate, and the PPI correction is then applied with the
    resulting combined proxy. With power tuning (the default) the estimator is
    always at least as efficient as the naive sample mean, regardless of the
    quality or number of proxies.

    When M = 1, the estimator is equivalent to PPIMeanEstimator with power_tuning=True.

    References
    ----------
    Shan, Jiawei, Zhifeng Chen, Yiming Dong, Yazhen Wang, and Jiwei Zhao.
    "SADA: Safe and Adaptive Aggregation of Multiple Black-Box Predictions in Semi-Supervised Learning."
    arXiv preprint arXiv:2509.21707 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import MultiPPIMeanEstimator
    >>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
    >>> y_proxies = np.array([[4.9], [6.1], [5.2], [6.1]])
    >>> estimator = MultiPPIMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxies)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.618
    Confidence Interval (95%): [4.923, 6.312]
    Estimator : MultiPPIMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 3
    """

    def _preprocess(
        self,
        y_true_all: NDArray,
        y_proxies_all: NDArray,
    ) -> Tuple[NDArray, NDArray, NDArray]:
        """Validate the inputs and split them into labeled and unlabeled parts.

        Returns the labeled ``y_true`` values, the proxies of the labeled
        samples and the proxies of the unlabeled samples, in that order.
        """
        # Validation order is kept: lengths, then proxies, then labels.
        _validate_equal_lengths(y_true_all, y_proxies_all, names=["y_true", "y_proxies"])
        _validate_y_proxies(y_proxies_all)
        _validate_y_true(y_true_all)

        split = _split_labeled_unlabeled(y_true_all, y_proxies_all)
        labeled_targets, labeled_proxies, unlabeled_proxies, labeled_mask = split

        _validate_sample_sizes(labeled_mask)
        return labeled_targets, labeled_proxies, unlabeled_proxies

    def estimate(
        self,
        y_true: NDArray,
        y_proxies: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        power_tuning: bool = True,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using MultiPPI.

        A small labeled set is combined with a large unlabeled set while using
        M proxy predictors at once. The tuning parameter vector lambda is
        estimated from the data and used to form a single combined proxy
        prediction before the PPI rectifier is applied.

        Parameters
        ----------
        y_true : NDArray
            Array of observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
        y_proxies : NDArray
            2D array of proxy predictions, shape ``(n_samples, M)``.
            Must be fully populated (no NaN). Each column must have nonzero variance.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        power_tuning : bool, optional
            If ``True`` (default), compute the optimal lambda to minimise the confidence
            interval width. If ``False``, set all tuning parameters to ``1/sqrt(M)``
            to limit proxy variance contribution for large M.

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Contains the CLT-based confidence interval, the metric name,
            the estimator name (``"MultiPPIMeanEstimator"``), the counts
            ``n_true`` (labeled observations) and ``n_proxy`` (all observations),
            and ``effective_sample_size``
            (``floor(n_true * var_classical / var_ppi)``, where ``var_classical``
            comes from ``ClassicalMeanEstimator`` run on the labeled values).

        Raises
        ------
        ValueError
            - If ``y_true`` and ``y_proxies`` have different lengths.
            - If ``y_proxies`` is not a 2D array.
            - If any value in ``y_proxies`` is NaN.
            - If any column of ``y_proxies`` is constant.
            - If ``y_true`` contains only NaN or its labeled values are constant.
            - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
            - If the proxy covariance matrix is singular.
        """
        labeled_targets, labeled_proxies, unlabeled_proxies = self._preprocess(y_true, y_proxies)
        labeled_count = len(labeled_targets)
        unlabeled_count = len(unlabeled_proxies)

        weights = _compute_tuning_parameters(labeled_targets, labeled_proxies, unlabeled_proxies, power_tuning)
        ppi_interval = CLTConfidenceInterval(
            mean=_compute_mean_estimate(labeled_targets, labeled_proxies, unlabeled_proxies, weights),
            std=_compute_std_estimate(labeled_targets, labeled_proxies, unlabeled_proxies, weights),
            confidence_level=confidence_level,
        )

        # Baseline without proxies: its variance relative to the PPI variance
        # rescales the labeled count into an effective sample size.
        baseline_interval = ClassicalMeanEstimator().estimate(labeled_targets).confidence_interval
        variance_ratio_scaled = labeled_count * baseline_interval.var / ppi_interval.var

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=ppi_interval,
            metric_name=metric_name,
            estimator_name=type(self).__name__,
            n_true=labeled_count,
            n_proxy=labeled_count + unlabeled_count,
            effective_sample_size=floor(variance_ratio_scaled),
        )

estimate

estimate(
    y_true,
    y_proxies,
    metric_name="Metric",
    confidence_level=0.95,
    power_tuning=True,
)

Estimate the population mean using MultiPPI.

Combines a small set of labeled samples with a large set of unlabeled samples, leveraging M proxy predictors simultaneously. The optimal tuning parameter vector lambda is estimated from the data and used to form a single combined proxy prediction before applying the PPI rectifier.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of observations, shape `(n_samples,)`. Labeled entries are finite; unlabeled entries are `np.nan`.	required
`y_proxies`	`NDArray`	2D array of proxy predictions, shape `(n_samples, M)`. Must be fully populated (no NaN). Each column must have nonzero variance.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`power_tuning`	`bool`	If `True` (default), compute the optimal lambda to minimise the confidence interval width. If `False`, set all tuning parameters to `1/sqrt(M)` to limit proxy variance contribution for large M.	`True`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains the CLT-based confidence interval, the metric name, the estimator name (`"MultiPPIMeanEstimator"`), and the counts `n_true` (labeled observations) and `n_proxy` (all observations).

Raises:

Type	Description
`ValueError`	If `y_true` and `y_proxies` have different lengths. If `y_proxies` is not a 2D array. If any value in `y_proxies` is NaN. If any column of `y_proxies` is constant. If `y_true` contains only NaN or its labeled values are constant. If there are fewer than 2 labeled or fewer than 2 unlabeled samples. If the proxy covariance matrix is singular.

Source code in glide/estimators/multi_ppi.py

def estimate(
    self,
    y_true: NDArray,
    y_proxies: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    power_tuning: bool = True,
) -> PredictionPoweredMeanInferenceResult:
    """Compute the MultiPPI estimate of the population mean.

    A handful of labeled samples is pooled with a much larger unlabeled set,
    exploiting M proxy predictors at once. A tuning vector lambda, estimated
    from the data, collapses the proxies into one combined prediction, to
    which the PPI rectifier is then applied.

    Parameters
    ----------
    y_true : NDArray
        Observations, shape ``(n_samples,)``. Finite values mark labeled
        samples; ``np.nan`` marks unlabeled ones.
    y_proxies : NDArray
        Proxy predictions, shape ``(n_samples, M)``. No NaN is allowed and
        every column needs nonzero variance.
    metric_name : str, optional
        Display label of the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Desired coverage of the confidence interval. Defaults to ``0.95``.
    power_tuning : bool, optional
        When ``True`` (default), lambda is chosen to minimise the width of
        the confidence interval. When ``False``, every tuning parameter is
        fixed to ``1/sqrt(M)`` to limit the proxy variance contribution for
        large M.

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the CLT-based confidence interval, the metric name, the
        estimator name (``"MultiPPIMeanEstimator"``), the counts ``n_true``
        (labeled observations) and ``n_proxy`` (all observations), and an
        effective sample size computed as
        ``floor(n_true * classical_var / estimator_var)``, where the
        classical variance comes from ``ClassicalMeanEstimator`` applied to
        the labeled values.

    Raises
    ------
    ValueError
        - If ``y_true`` and ``y_proxies`` have different lengths.
        - If ``y_proxies`` is not a 2D array.
        - If any value in ``y_proxies`` is NaN.
        - If any column of ``y_proxies`` is constant.
        - If ``y_true`` contains only NaN or its labeled values are constant.
        - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
        - If the proxy covariance matrix is singular.
    """
    labels, proxies_labeled, proxies_unlabeled = self._preprocess(y_true, y_proxies)
    n_labeled, n_unlabeled = len(labels), len(proxies_unlabeled)

    # The three helpers below all take the same leading triple of arrays.
    split = (labels, proxies_labeled, proxies_unlabeled)
    lambdas_ = _compute_tuning_parameters(*split, power_tuning)
    ppi_interval = CLTConfidenceInterval(
        mean=_compute_mean_estimate(*split, lambdas_),
        std=_compute_std_estimate(*split, lambdas_),
        confidence_level=confidence_level,
    )

    # Labeled-only baseline: its variance is the yardstick for the effective sample size.
    baseline_interval = ClassicalMeanEstimator().estimate(labels).confidence_interval
    effective_sample_size = floor(n_labeled * baseline_interval.var / ppi_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=ppi_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        n_proxy=n_labeled + n_unlabeled,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.ptd.PTDMeanEstimator

Estimator for population mean using Predict-Then-Debias (PTD).

Combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy model. Confidence intervals are constructed via a bootstrap percentile method, requiring no distributional assumptions on the proxy quality.

The bootstrap uses a CLT-based algorithm: the unlabeled proxy mean is computed once on the full unlabeled set and its sampling variability is simulated with a Gaussian draw at each iteration, making the per-iteration cost O(n_labeled) rather than O(n_labeled + n_unlabeled), where n_labeled and n_unlabeled are the number of labeled and unlabeled samples respectively.

References

Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates. "Prediction-powered inference with imputed covariates and nonuniform sampling." arXiv preprint arXiv:2501.18577 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators import PTDMeanEstimator
>>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
>>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
>>> estimator = PTDMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, n_bootstrap=5, random_seed=0)
>>> print(result)
Metric: Metric
Point Estimate: 5.552
Confidence Interval (95%): [5.211, 5.865]
Estimator : PTDMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 5

Source code in glide/estimators/ptd.py

class PTDMeanEstimator:
    """Population-mean estimator based on Predict-Then-Debias (PTD).

    A small labeled sample is combined with a large unlabeled sample whose
    labels are approximated by a proxy model. The confidence interval comes
    from a bootstrap percentile method, so no distributional assumption on
    the quality of the proxy is needed.

    The bootstrap relies on a CLT shortcut: the mean of the unlabeled proxies
    is computed a single time on the whole unlabeled set, and its sampling
    variability is simulated by a Gaussian draw in each iteration. The cost of
    one iteration is therefore O(n_labeled) instead of
    O(n_labeled + n_unlabeled), with n_labeled and n_unlabeled the numbers of
    labeled and unlabeled samples.

    References
    ----------
    Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates.
    "Prediction-powered inference with imputed covariates and nonuniform sampling."
    arXiv preprint arXiv:2501.18577 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import PTDMeanEstimator
    >>> y_true = np.array([5.0, 6.0, np.nan, np.nan])
    >>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1])
    >>> estimator = PTDMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, n_bootstrap=5, random_seed=0)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.552
    Confidence Interval (95%): [5.211, 5.865]
    Estimator : PTDMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 5
    """

    def _preprocess(self, y_true_all: NDArray, y_proxy_all: NDArray) -> Tuple[NDArray, NDArray, NDArray]:
        """Validate the raw arrays and split them into labeled / unlabeled parts.

        Returns the labeled ground-truth values, the proxies of the labeled
        samples and the proxies of the unlabeled samples, in that order.
        """
        _validate_equal_lengths(y_true_all, y_proxy_all, names=["y_true", "y_proxy"])
        _validate_y_proxy(y_proxy_all)
        _validate_y_true(y_true_all)

        labels, proxy_labeled, proxy_unlabeled, is_labeled = _split_labeled_unlabeled(y_true_all, y_proxy_all)
        # The sample-size check operates on the mask returned by the split.
        _validate_sample_sizes(is_labeled)
        return labels, proxy_labeled, proxy_unlabeled

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        n_bootstrap: int = 2000,
        power_tuning: bool = True,
        random_seed: Optional[int] = None,
    ) -> PredictionPoweredMeanInferenceResult:
        """Compute the Predict-Then-Debias (PTD) estimate of the population mean.

        A small labeled sample is combined with a large unlabeled sample whose
        labels are approximated by a proxy model. The rectifier
        ``mean(y_true) - λ·mean(y_proxy_labeled)`` removes the bias of the
        proxy, so the estimate stays consistent even for an imperfect proxy.

        Both the tuning parameter λ and the confidence interval are obtained
        from a bootstrap that resamples the labeled set only. The variability
        of the unlabeled proxy mean is approximated with one Gaussian draw per
        iteration, which keeps the per-iteration cost at O(n_labeled), where
        n_labeled is the number of labeled samples.

        Parameters
        ----------
        y_true : NDArray
            Observations, shape ``(n_samples,)``. Finite values mark labeled
            samples; ``np.nan`` marks unlabeled ones.
        y_proxy : NDArray
            Proxy predictions, shape ``(n_samples,)``. No NaN is allowed and
            the values need nonzero variance.
        metric_name : str, optional
            Display label of the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Desired coverage of the confidence interval. Defaults to ``0.95``.
        n_bootstrap : int, optional
            How many bootstrap resamples to draw. Defaults to ``2000``.
        power_tuning : bool, optional
            When ``True`` (default), the optimal λ is estimated from the
            bootstrap covariances. When ``False``, λ = 1 is used.
        random_seed : int, optional
            Seed of the random number generator, for reproducible results.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds a ``BootstrapConfidenceInterval``, the metric name, the
            estimator name (``"PTDMeanEstimator"``), the counts ``n_true``
            (labeled samples) and ``n_proxy`` (total samples), and an
            effective sample size computed as
            ``floor(n_true * classical_var / estimator_var)``, where the
            classical variance comes from ``ClassicalMeanEstimator`` applied
            to the labeled values.

        Raises
        ------
        ValueError
            - If ``y_true`` and ``y_proxy`` have different lengths.
            - If any proxy value is NaN.
            - If all proxy values are identical.
            - If labeled ``y_true`` values are constant.
            - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        labels, proxy_labeled, proxy_unlabeled = self._preprocess(y_true, y_proxy)
        n_labeled = len(labels)
        n_unlabeled = len(proxy_unlabeled)
        rng = np.random.default_rng(random_seed)

        # Summary statistics of the unlabeled proxies, computed once up front.
        unlabeled_mean = np.mean(proxy_unlabeled)
        unlabeled_mean_var = np.var(proxy_unlabeled, ddof=1) / n_unlabeled

        # The generator is shared by both bootstrap helpers; the call order
        # below determines the random stream each of them sees.
        boot_true_means, boot_proxy_means = _compute_bootstrap_labeled_means(
            labels, proxy_labeled, n_bootstrap, rng
        )
        lambda_ = _compute_tuning_parameter(boot_true_means, boot_proxy_means, unlabeled_mean_var, power_tuning)
        ptd_interval = BootstrapConfidenceInterval(
            bootstrap_estimates=_compute_bootstrap_mean_estimates(
                boot_true_means,
                boot_proxy_means,
                unlabeled_mean,
                unlabeled_mean_var,
                lambda_,
                rng,
            ),
            confidence_level=confidence_level,
        )

        # Labeled-only baseline: its variance is the yardstick for the effective sample size.
        baseline_interval = ClassicalMeanEstimator().estimate(labels).confidence_interval
        effective_sample_size = floor(n_labeled * baseline_interval.var / ptd_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=ptd_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            n_proxy=n_labeled + n_unlabeled,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    metric_name="Metric",
    confidence_level=0.95,
    n_bootstrap=2000,
    power_tuning=True,
    random_seed=None,
)

Estimate the population mean using Predict-Then-Debias (PTD).

Combines a small set of labeled samples with a large set of unlabeled samples whose labels are approximated by a proxy model. The rectifier mean(y_true) - λ·mean(y_proxy_labeled) corrects the bias of the proxy, yielding a consistent estimate even when the proxy is imperfect.

The tuning parameter λ and the confidence interval are both derived from a bootstrap over the labeled set only. The sampling variability of the unlabeled proxy mean is approximated by a single Gaussian draw per iteration, keeping the per-iteration cost O(n_labeled), where n_labeled is the number of labeled samples.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of observations, shape `(n_samples,)`. Labeled entries are finite; unlabeled entries are `np.nan`.	required
`y_proxy`	`NDArray`	Array of proxy predictions, shape `(n_samples,)`. Must be fully populated (no NaN). Must have nonzero variance.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`n_bootstrap`	`int`	Number of bootstrap resamples. Defaults to `2000`.	`2000`
`power_tuning`	`bool`	If `True` (default), estimate the optimal tuning parameter λ from the bootstrap covariances. If `False`, use λ = 1.	`True`
`random_seed`	`int`	Seed for the random number generator, for reproducibility. Defaults to `None` (non-deterministic).	`None`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains a `BootstrapConfidenceInterval`, metric name, estimator name (`"PTDMeanEstimator"`), and counts `n_true` (labeled samples) and `n_proxy` (total samples).

Raises:

Type	Description
`ValueError`	If `y_true` and `y_proxy` have different lengths. If any proxy value is NaN. If all proxy values are identical. If labeled `y_true` values are constant. If there are fewer than 2 labeled or fewer than 2 unlabeled samples.

Source code in glide/estimators/ptd.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    n_bootstrap: int = 2000,
    power_tuning: bool = True,
    random_seed: Optional[int] = None,
) -> PredictionPoweredMeanInferenceResult:
    """Compute the Predict-Then-Debias (PTD) estimate of the population mean.

    A small labeled sample is combined with a large unlabeled sample whose
    labels are approximated by a proxy model. The rectifier
    ``mean(y_true) - λ·mean(y_proxy_labeled)`` removes the bias of the proxy,
    so the estimate stays consistent even for an imperfect proxy.

    Both the tuning parameter λ and the confidence interval are obtained from
    a bootstrap that resamples the labeled set only. The variability of the
    unlabeled proxy mean is approximated with one Gaussian draw per iteration,
    which keeps the per-iteration cost at O(n_labeled), where n_labeled is the
    number of labeled samples.

    Parameters
    ----------
    y_true : NDArray
        Observations, shape ``(n_samples,)``. Finite values mark labeled
        samples; ``np.nan`` marks unlabeled ones.
    y_proxy : NDArray
        Proxy predictions, shape ``(n_samples,)``. No NaN is allowed and the
        values need nonzero variance.
    metric_name : str, optional
        Display label of the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Desired coverage of the confidence interval. Defaults to ``0.95``.
    n_bootstrap : int, optional
        How many bootstrap resamples to draw. Defaults to ``2000``.
    power_tuning : bool, optional
        When ``True`` (default), the optimal λ is estimated from the bootstrap
        covariances. When ``False``, λ = 1 is used.
    random_seed : int, optional
        Seed of the random number generator, for reproducible results.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds a ``BootstrapConfidenceInterval``, the metric name, the
        estimator name (``"PTDMeanEstimator"``), the counts ``n_true``
        (labeled samples) and ``n_proxy`` (total samples), and an effective
        sample size computed as
        ``floor(n_true * classical_var / estimator_var)``, where the classical
        variance comes from ``ClassicalMeanEstimator`` applied to the labeled
        values.

    Raises
    ------
    ValueError
        - If ``y_true`` and ``y_proxy`` have different lengths.
        - If any proxy value is NaN.
        - If all proxy values are identical.
        - If labeled ``y_true`` values are constant.
        - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    labels, proxy_labeled, proxy_unlabeled = self._preprocess(y_true, y_proxy)
    n_labeled = len(labels)
    n_unlabeled = len(proxy_unlabeled)
    rng = np.random.default_rng(random_seed)

    # Summary statistics of the unlabeled proxies, computed once up front.
    unlabeled_mean = np.mean(proxy_unlabeled)
    unlabeled_mean_var = np.var(proxy_unlabeled, ddof=1) / n_unlabeled

    # The generator is shared by both bootstrap helpers; the call order below
    # determines the random stream each of them sees.
    boot_true_means, boot_proxy_means = _compute_bootstrap_labeled_means(labels, proxy_labeled, n_bootstrap, rng)
    lambda_ = _compute_tuning_parameter(boot_true_means, boot_proxy_means, unlabeled_mean_var, power_tuning)
    ptd_interval = BootstrapConfidenceInterval(
        bootstrap_estimates=_compute_bootstrap_mean_estimates(
            boot_true_means,
            boot_proxy_means,
            unlabeled_mean,
            unlabeled_mean_var,
            lambda_,
            rng,
        ),
        confidence_level=confidence_level,
    )

    # Labeled-only baseline: its variance is the yardstick for the effective sample size.
    baseline_interval = ClassicalMeanEstimator().estimate(labels).confidence_interval
    effective_sample_size = floor(n_labeled * baseline_interval.var / ptd_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=ptd_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        n_proxy=n_labeled + n_unlabeled,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.ipw_ptd.IPWPTDMeanEstimator

Estimator for population mean using IPW-corrected Predict-Then-Debias (IPW-PTD).

Extends PTD to handle non-uniform ground-truth labelling probabilities via inverse probability weighting. The bootstrap percentile confidence interval requires no distributional assumptions on the proxy quality. The CLT speedup is applied to the unlabeled proxies. However, inverse probability weighting requires sampling over the whole dataset to compute bootstrap ground-truth mean and labeled proxy mean estimates.

For large sample counts (where the CLT applies), this estimator produces inference equivalent to ASIMeanEstimator, but without relying on the normal approximation for the labeled rectifier.

References

Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates. "Prediction-powered inference with imputed covariates and nonuniform sampling." arXiv preprint arXiv:2501.18577 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators.ipw_ptd import IPWPTDMeanEstimator
>>> y_true = np.array([1.0, 0.0, np.nan, np.nan])
>>> y_proxy = np.array([0.9, 0.1, 0.8, 0.2])
>>> pi = np.array([0.4, 0.6, 0.3, 0.7])
>>> estimator = IPWPTDMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, pi, n_bootstrap=5, random_seed=0)
>>> print(result)
Metric: Metric
Point Estimate: 0.253
Confidence Interval (95%): [-0.082, 0.633]
Estimator : IPWPTDMeanEstimator
n_true: 2
n_proxy: 4
Effective Sample Size: 9

Source code in glide/estimators/ipw_ptd.py

class IPWPTDMeanEstimator:
    """Estimator for population mean using IPW-corrected Predict-Then-Debias (IPW-PTD).

    Extends PTD to handle non-uniform ground-truth labelling probabilities via inverse probability
    weighting. The bootstrap percentile confidence interval requires no distributional
    assumptions on the proxy quality. The CLT speedup is applied to the unlabeled proxies.
    However, inverse probability weighting requires sampling over the whole dataset to
    compute bootstrap ground-truth mean and labeled proxy mean estimates.

    For large sample counts (where the CLT applies), this estimator produces inference
    equivalent to ``ASIMeanEstimator``, but without relying on the normal approximation
    for the labeled rectifier.

    References
    ----------
    Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates.
    "Prediction-powered inference with imputed covariates and nonuniform sampling."
    arXiv preprint arXiv:2501.18577 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators.ipw_ptd import IPWPTDMeanEstimator
    >>> y_true = np.array([1.0, 0.0, np.nan, np.nan])
    >>> y_proxy = np.array([0.9, 0.1, 0.8, 0.2])
    >>> pi = np.array([0.4, 0.6, 0.3, 0.7])
    >>> estimator = IPWPTDMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, pi, n_bootstrap=5, random_seed=0)
    >>> print(result)
    Metric: Metric
    Point Estimate: 0.253
    Confidence Interval (95%): [-0.082, 0.633]
    Estimator : IPWPTDMeanEstimator
    n_true: 2
    n_proxy: 4
    Effective Sample Size: 9
    """

    def _preprocess(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
    ) -> Tuple[NDArray, NDArray, NDArray, NDArray]:
        """Validate the inputs and build the labelling indicator.

        Returns ``(y_true_filled, y_proxy, xi, pi)`` where ``y_true_filled`` is
        ``y_true`` with NaN replaced by 0 and ``xi`` is a float array equal to
        1.0 for labeled (non-NaN) samples and 0.0 otherwise. ``y_proxy`` and
        ``pi`` are passed through unchanged.
        """
        _validate_equal_lengths(y_true, y_proxy, pi, names=["y_true", "y_proxy", "pi"])
        _validate_probabilities(pi)
        _validate_y_proxy(y_proxy)
        _validate_y_true(y_true)

        y_true_non_nan_mask = ~np.isnan(y_true)
        xi = y_true_non_nan_mask.astype(float)

        _validate_sample_sizes(y_true_non_nan_mask)
        _validate_label_prob_consistency(y_true_non_nan_mask, pi)

        # NaN placeholders become 0; those rows carry xi == 0, so they add
        # nothing once multiplied by the labeled IPW weights xi / pi.
        y_true_filled = np.nan_to_num(y_true, nan=0)
        return y_true_filled, y_proxy, xi, pi

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        pi: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        n_bootstrap: int = 2000,
        power_tuning: bool = True,
        random_seed: Optional[int] = None,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using IPW-corrected Predict-Then-Debias.

        Ground-truth labels were sampled with known, non-uniform probabilities π_i.
        Inverse probability weighting (IPW) corrects for this non-uniform selection,
        yielding valid confidence intervals under any sampling rule.
        The unlabeled proxy mean is not resampled: its sampling variability is injected
        via a single Gaussian draw per iteration (CLT speedup).

        Parameters
        ----------
        y_true : NDArray
            Array of shape ``(n_samples,)`` with ground-truth labels. Use ``np.nan`` for
            unlabeled samples; non-NaN entries are treated as labeled.
        y_proxy : NDArray
            Array of shape ``(n_samples,)`` with proxy predictions. Must be present for every
            sample and must not contain NaN.
        pi : NDArray
            Array of shape ``(n_samples,)`` with the ground-truth labelling probability
            π_i ∈ [0, 1] for each sample.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        n_bootstrap : int, optional
            Number of bootstrap resamples. Defaults to ``2000``.
        power_tuning : bool, optional
            If ``True`` (default), estimates λ from bootstrap covariances to minimise variance.
            If ``False``, uses λ = 1.
        random_seed : int, optional
            Seed for the random number generator, for reproducibility.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Contains a ``BootstrapConfidenceInterval``, metric name, estimator
            name (``"IPWPTDMeanEstimator"``), the counts ``n_true`` (labeled samples) and
            ``n_proxy`` (total samples), and an effective sample size computed as
            ``floor(n_true * classical_var / estimator_var)``, where the classical
            variance comes from ``IPWClassicalMeanEstimator``.

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
            - If any proxy value is NaN.
            - If all proxy values are identical.
            - If labeled ``y_true`` values are constant.
            - If any sampling probability is not in [0, 1].
            - If any labeled sample (non-NaN ``y_true``) has a labeling probability of 0.
            - If any unlabeled sample (NaN ``y_true``) has a labeling probability of 1.
            - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        y_true_filled, y_proxy, xi, pi = self._preprocess(y_true, y_proxy, pi)
        rng = np.random.default_rng(random_seed)

        # Presumably True where the argument is non-zero, i.e. rows with
        # pi != 0 and pi != 1 respectively -- confirm against the helper.
        non_zero_pi_mask = _get_non_zero_mask(pi)
        non_one_pi_mask = _get_non_zero_mask(1 - pi)

        # Rows with pi == 0 or pi == 1 divide by zero here; they are dropped by
        # the masks right below. Only NumPy's divide/invalid floating-point
        # signals are silenced, and only inside this scope, instead of muting
        # every warning through the process-global warnings filter.
        with np.errstate(divide="ignore", invalid="ignore"):
            labeled_ipw_weights = xi / pi
            unlabeled_ipw_weights = (1 - xi) / (1 - pi)

        weighted_y_true_filled = (y_true_filled * labeled_ipw_weights)[non_zero_pi_mask]
        weighted_y_proxy_labeled = (y_proxy * labeled_ipw_weights)[non_zero_pi_mask]
        weighted_y_proxy_unlabeled = (y_proxy * unlabeled_ipw_weights)[non_one_pi_mask]

        mean_proxy_unlabeled = np.mean(weighted_y_proxy_unlabeled)
        # Number of rows entering the unlabeled IPW mean (rows with pi == 1 are
        # excluded); used only as the denominator of the variance of that mean.
        effective_n_proxy_unlabeled = len(weighted_y_proxy_unlabeled)
        var_proxy_unlabeled = np.var(weighted_y_proxy_unlabeled, ddof=1) / effective_n_proxy_unlabeled

        # The generator is shared by both bootstrap helpers; the call order
        # determines the random stream each of them sees.
        bootstrap_y_true_means, bootstrap_y_proxy_labeled_means = _compute_bootstrap_labeled_means(
            weighted_y_true_filled, weighted_y_proxy_labeled, n_bootstrap, rng
        )
        lambda_ = _compute_tuning_parameter(
            bootstrap_y_true_means, bootstrap_y_proxy_labeled_means, var_proxy_unlabeled, power_tuning
        )
        bootstrap_mean_estimates = _compute_bootstrap_mean_estimates(
            bootstrap_y_true_means,
            bootstrap_y_proxy_labeled_means,
            mean_proxy_unlabeled,
            var_proxy_unlabeled,
            lambda_,
            rng,
        )

        # xi is a 0/1 indicator, so its sum is the number of labeled samples.
        n_labeled = int(xi.sum())

        confidence_interval = BootstrapConfidenceInterval(
            bootstrap_estimates=bootstrap_mean_estimates,
            confidence_level=confidence_level,
        )
        classical_confidence_interval = IPWClassicalMeanEstimator().estimate(y_true, pi).confidence_interval
        effective_sample_size = floor(n_labeled * classical_confidence_interval.var / confidence_interval.var)
        return PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            # Report the total number of samples, as documented, rather than
            # the pi != 1 subset used for the unlabeled proxy variance.
            n_proxy=len(y_proxy),
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    pi,
    metric_name="Metric",
    confidence_level=0.95,
    n_bootstrap=2000,
    power_tuning=True,
    random_seed=None,
)

Estimate the population mean using IPW-corrected Predict-Then-Debias.

Ground-truth labels were sampled with known, non-uniform probabilities π_i. Inverse probability weighting (IPW) corrects for this non-uniform selection, yielding valid confidence intervals under any sampling rule. The unlabeled proxy mean is not resampled: its sampling variability is injected via a single Gaussian draw per iteration (CLT speedup).

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of shape `(n_samples,)` with ground-truth labels. Use `np.nan` for unlabeled samples; non-NaN entries are treated as labeled.	required
`y_proxy`	`NDArray`	Array of shape `(n_samples,)` with proxy predictions. Must be present for every sample and must not contain NaN.	required
`pi`	`NDArray`	Array of shape `(n_samples,)` with the ground-truth labelling probability π_i ∈ [0, 1] for each sample.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`n_bootstrap`	`int`	Number of bootstrap resamples. Defaults to `2000`.	`2000`
`power_tuning`	`bool`	If `True` (default), estimates λ from bootstrap covariances to minimise variance. If `False`, uses λ = 1.	`True`
`random_seed`	`int`	Seed for the random number generator, for reproducibility. Defaults to `None` (non-deterministic).	`None`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains a `BootstrapConfidenceInterval`, metric name, estimator name (`"IPWPTDMeanEstimator"`), and counts `n_true` (labeled samples) and `n_proxy` (total samples).

Raises:

Type	Description
`ValueError`	If `y_true`, `y_proxy`, and `pi` do not all have the same length. If any proxy value is NaN. If all proxy values are identical. If labeled `y_true` values are constant. If any sampling probability is not in [0, 1]. If any labeled sample (non-NaN `y_true`) has a labeling probability of 0. If any unlabeled sample (NaN `y_true`) has a labeling probability of 1. If there are fewer than 2 labeled or fewer than 2 unlabeled samples.

Source code in glide/estimators/ipw_ptd.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    pi: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    n_bootstrap: int = 2000,
    power_tuning: bool = True,
    random_seed: Optional[int] = None,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean using IPW-corrected Predict-Then-Debias.

    Ground-truth labels were sampled with known, non-uniform probabilities π_i.
    Inverse probability weighting (IPW) corrects for this non-uniform selection,
    yielding valid confidence intervals under any sampling rule.
    The unlabeled proxy mean is not resampled: its sampling variability is injected
    via a single Gaussian draw per iteration (CLT speedup).

    Parameters
    ----------
    y_true : NDArray
        Array of shape ``(n_samples,)`` with ground-truth labels. Use ``np.nan`` for
        unlabeled samples; non-NaN entries are treated as labeled.
    y_proxy : NDArray
        Array of shape ``(n_samples,)`` with proxy predictions. Must be present for every
        sample and must not contain NaN.
    pi : NDArray
        Array of shape ``(n_samples,)`` with the ground-truth labelling probability
        π_i ∈ [0, 1] for each sample.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    n_bootstrap : int, optional
        Number of bootstrap resamples. Defaults to ``2000``.
    power_tuning : bool, optional
        If ``True`` (default), estimates λ from bootstrap covariances to minimise variance.
        If ``False``, uses λ = 1.
    random_seed : int, optional
        Seed for the random number generator, for reproducibility.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Contains a ``BootstrapConfidenceInterval``, metric name, estimator
        name (``"IPWPTDMeanEstimator"``), the counts ``n_true`` (labeled samples) and
        ``n_proxy`` (total samples), and an effective sample size computed as
        ``floor(n_true * classical_var / estimator_var)``, where the classical
        variance comes from ``IPWClassicalMeanEstimator``.

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``pi`` do not all have the same length.
        - If any proxy value is NaN.
        - If all proxy values are identical.
        - If labeled ``y_true`` values are constant.
        - If any sampling probability is not in [0, 1].
        - If any labeled sample (non-NaN ``y_true``) has a labeling probability of 0.
        - If any unlabeled sample (NaN ``y_true``) has a labeling probability of 1.
        - If there are fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    y_true_filled, y_proxy, xi, pi = self._preprocess(y_true, y_proxy, pi)
    rng = np.random.default_rng(random_seed)

    # Presumably True where the argument is non-zero, i.e. rows with
    # pi != 0 and pi != 1 respectively -- confirm against the helper.
    non_zero_pi_mask = _get_non_zero_mask(pi)
    non_one_pi_mask = _get_non_zero_mask(1 - pi)

    # Rows with pi == 0 or pi == 1 divide by zero here; they are dropped by the
    # masks right below. Only NumPy's divide/invalid floating-point signals are
    # silenced, and only inside this scope, instead of muting every warning
    # through the process-global warnings filter.
    with np.errstate(divide="ignore", invalid="ignore"):
        labeled_ipw_weights = xi / pi
        unlabeled_ipw_weights = (1 - xi) / (1 - pi)

    weighted_y_true_filled = (y_true_filled * labeled_ipw_weights)[non_zero_pi_mask]
    weighted_y_proxy_labeled = (y_proxy * labeled_ipw_weights)[non_zero_pi_mask]
    weighted_y_proxy_unlabeled = (y_proxy * unlabeled_ipw_weights)[non_one_pi_mask]

    mean_proxy_unlabeled = np.mean(weighted_y_proxy_unlabeled)
    # Number of rows entering the unlabeled IPW mean (rows with pi == 1 are
    # excluded); used only as the denominator of the variance of that mean.
    effective_n_proxy_unlabeled = len(weighted_y_proxy_unlabeled)
    var_proxy_unlabeled = np.var(weighted_y_proxy_unlabeled, ddof=1) / effective_n_proxy_unlabeled

    # The generator is shared by both bootstrap helpers; the call order
    # determines the random stream each of them sees.
    bootstrap_y_true_means, bootstrap_y_proxy_labeled_means = _compute_bootstrap_labeled_means(
        weighted_y_true_filled, weighted_y_proxy_labeled, n_bootstrap, rng
    )
    lambda_ = _compute_tuning_parameter(
        bootstrap_y_true_means, bootstrap_y_proxy_labeled_means, var_proxy_unlabeled, power_tuning
    )
    bootstrap_mean_estimates = _compute_bootstrap_mean_estimates(
        bootstrap_y_true_means,
        bootstrap_y_proxy_labeled_means,
        mean_proxy_unlabeled,
        var_proxy_unlabeled,
        lambda_,
        rng,
    )

    # xi is the labelling indicator returned by _preprocess; its sum is used
    # as the number of labeled samples.
    n_labeled = int(xi.sum())

    confidence_interval = BootstrapConfidenceInterval(
        bootstrap_estimates=bootstrap_mean_estimates,
        confidence_level=confidence_level,
    )
    classical_confidence_interval = IPWClassicalMeanEstimator().estimate(y_true, pi).confidence_interval
    effective_sample_size = floor(n_labeled * classical_confidence_interval.var / confidence_interval.var)
    return PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        # Report the total number of samples, as documented, rather than the
        # pi != 1 subset used for the unlabeled proxy variance.
        n_proxy=len(y_proxy),
        effective_sample_size=effective_sample_size,
    )

glide.estimators.stratified_ptd.StratifiedPTDMeanEstimator

Stratified Predict-Then-Debias estimator for population mean.

Extends PTD to datasets partitioned into strata (e.g. by language, domain, or data source). A per-stratum power-tuning parameter is computed independently within each stratum, and the final confidence interval is constructed from a bootstrap distribution obtained by combining the per-stratum bootstrap estimates with weights proportional to the stratum sizes.

This yields narrower confidence intervals than standard PTD whenever strata differ in proxy quality, because the optimal power-tuning parameter can adapt to each stratum's signal-to-noise ratio.

References

Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates. "Prediction-powered inference with imputed covariates and nonuniform sampling." arXiv preprint arXiv:2501.18577 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators import StratifiedPTDMeanEstimator
>>> y_true = np.array([5.0, 6.0, np.nan, np.nan, 5.0, 6.0, np.nan, np.nan])
>>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1, 4.9, 6.1, 5.2, 6.1])
>>> groups = np.array(["A", "A", "A", "A", "B", "B", "B", "B"])
>>> estimator = StratifiedPTDMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, groups, n_bootstrap=5, random_seed=0)
>>> print(result)
Metric: Metric
Point Estimate: 5.578
Confidence Interval (95%): [5.400, 5.664]
Estimator : StratifiedPTDMeanEstimator
n_true: 4
n_proxy: 8
Effective Sample Size: 33

Source code in glide/estimators/stratified_ptd.py

class StratifiedPTDMeanEstimator:
    """Population-mean estimator based on stratified Predict-Then-Debias (PTD).

    The data are partitioned into strata (for instance by language, domain, or
    data source) and the PTD bootstrap is run separately inside every stratum,
    each with its own power-tuning parameter. The per-stratum bootstrap
    replicates are then merged, weighted by relative stratum size, and the
    confidence interval is read off the merged bootstrap distribution.

    Because the power-tuning parameter is free to follow the signal-to-noise
    ratio of each stratum, the resulting interval is narrower than the one from
    plain PTD whenever proxy quality varies between strata.

    References
    ----------
    Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates.
    "Prediction-powered inference with imputed covariates and nonuniform sampling."
    arXiv preprint arXiv:2501.18577 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import StratifiedPTDMeanEstimator
    >>> y_true = np.array([5.0, 6.0, np.nan, np.nan, 5.0, 6.0, np.nan, np.nan])
    >>> y_proxy = np.array([4.9, 6.1, 5.2, 6.1, 4.9, 6.1, 5.2, 6.1])
    >>> groups = np.array(["A", "A", "A", "A", "B", "B", "B", "B"])
    >>> estimator = StratifiedPTDMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, groups, n_bootstrap=5, random_seed=0)
    >>> print(result)
    Metric: Metric
    Point Estimate: 5.578
    Confidence Interval (95%): [5.400, 5.664]
    Estimator : StratifiedPTDMeanEstimator
    n_true: 4
    n_proxy: 8
    Effective Sample Size: 33
    """

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        groups: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        n_bootstrap: int = 2000,
        power_tuning: bool = True,
        random_seed: Optional[int] = None,
    ) -> PredictionPoweredMeanInferenceResult:
        """Compute a stratified Predict-Then-Debias estimate of the population mean.

        The inputs are split by the unique values of ``groups``. Within each
        stratum the PTD bootstrap is run with its own power-tuning parameter,
        and the per-stratum bootstrap arrays are summed with size-proportional
        weights to form the replicates behind a single
        ``BootstrapConfidenceInterval``:

            theta = sum_k  w_k * theta_k(lambda_k)

        Here ``w_k`` is the share of all samples that fall in stratum *k*, and
        ``theta_k(lambda_k)`` is that stratum's mean estimate under tuning
        parameter ``lambda_k``.

        The weights are taken from the observed data, so the estimate is only
        statistically valid if those shares match the stratum proportions of
        the target data distribution.

        A row counts as labeled when its ``y_true`` entry is not ``NaN`` and as
        unlabeled otherwise.

        Parameters
        ----------
        y_true : NDArray
            Observations, shape ``(n_samples,)``. Finite where a label exists,
            ``np.nan`` where it does not.
        y_proxy : NDArray
            Proxy predictions, shape ``(n_samples,)``. No NaN is allowed, and the
            values must have nonzero variance within each stratum.
        groups : NDArray
            Stratum identifier of every row, shape ``(n_samples,)``. Each unique
            value defines one stratum.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage of the confidence interval. Defaults to ``0.95``.
        n_bootstrap : int, optional
            Number of bootstrap resamples. Defaults to ``2000``.
        power_tuning : bool, optional
            When ``True`` (default), the per-stratum parameter ``lambda_k`` is
            estimated from the bootstrap covariances. When ``False``,
            ``lambda_k = 1.0`` is used in every stratum.
        random_seed : int, optional
            Seed for the random number generator, for reproducibility.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Holds the bootstrap-based confidence interval, the metric name, the
            estimator name (``"StratifiedPTDMeanEstimator"``), the counts
            ``n_true`` (total labeled rows) and ``n_proxy`` (total dataset size),
            and ``effective_sample_size``, computed as
            ``floor(n_true * var_classical / var_ptd)`` where ``var_classical``
            is the interval variance of ``StratifiedClassicalMeanEstimator`` run
            on the same data with the same stratum weights.

        Raises
        ------
        ValueError
            - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
            - If any proxy value is NaN.
            - If labeled ``y_true`` values are constant.
            - If all proxy values within a stratum are identical (zero variance), which would
              cause a division by zero when computing the power-tuning parameter.
            - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
        """
        per_stratum_arrays = _preprocess(y_true, y_proxy, groups)

        total_count = len(y_true)
        generator = np.random.default_rng(random_seed)

        # Running weighted sum of the per-stratum bootstrap replicates.
        combined_replicates = np.zeros(n_bootstrap)

        for labeled_truth, labeled_proxy, unlabeled_proxy in per_stratum_arrays:
            unlabeled_count = len(unlabeled_proxy)
            # Share of the whole dataset (labeled + unlabeled rows) held by this stratum.
            weight = (len(labeled_truth) + unlabeled_count) / total_count

            # Mean of the unlabeled proxies and the estimated variance of that mean.
            unlabeled_proxy_mean = np.mean(unlabeled_proxy)
            unlabeled_proxy_mean_var = np.var(unlabeled_proxy, ddof=1) / unlabeled_count

            # Both sampling helpers receive the same generator, so their call
            # order is kept fixed to leave seeded results unchanged.
            truth_means, proxy_means = _compute_bootstrap_labeled_means(
                labeled_truth, labeled_proxy, n_bootstrap, generator
            )
            tuning = _compute_tuning_parameter(truth_means, proxy_means, unlabeled_proxy_mean_var, power_tuning)
            stratum_replicates = _compute_bootstrap_mean_estimates(
                truth_means,
                proxy_means,
                unlabeled_proxy_mean,
                unlabeled_proxy_mean_var,
                tuning,
                generator,
            )

            combined_replicates += weight * stratum_replicates

        confidence_interval = BootstrapConfidenceInterval(
            bootstrap_estimates=combined_replicates,
            confidence_level=confidence_level,
        )

        # Proxy-free baseline with the same size-proportional stratum weights;
        # its variance is the reference for the effective sample size.
        counts_per_stratum = np.unique(groups, return_counts=True)[1]
        baseline_estimate = StratifiedClassicalMeanEstimator().estimate(
            y_true, groups, stratum_weights=counts_per_stratum / total_count
        )
        baseline_interval = baseline_estimate.confidence_interval

        labeled_total = int(np.sum(~np.isnan(y_true)))
        effective_sample_size = floor(labeled_total * baseline_interval.var / confidence_interval.var)

        return PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=labeled_total,
            n_proxy=total_count,
            effective_sample_size=effective_sample_size,
        )

estimate

estimate(
    y_true,
    y_proxy,
    groups,
    metric_name="Metric",
    confidence_level=0.95,
    n_bootstrap=2000,
    power_tuning=True,
    random_seed=None,
)

Estimate the population mean using Stratified Predict-Then-Debias.

Splits arrays by unique values in groups, applies the PTD bootstrap algorithm within each stratum with a per-stratum power-tuning, and combines the resulting per-stratum bootstrap arrays with weights proportional to the stratum sizes into a single BootstrapConfidenceInterval:

theta = sum_k  w_k * theta_k(lambda_k)

where w_k is the fraction of samples in stratum k and theta_k(lambda_k) is the mean estimate for that stratum computed with power-tuning parameter lambda_k.

Note that this assumes these fractions reflect the true stratum weights in the target data distribution, which is important for statistical validity.

Labeled and unlabeled samples are distinguished by NaN in y_true: a sample is labeled if its y_true entry is not NaN.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of observations, shape `(n_samples,)`. Labeled entries are finite; unlabeled entries are `np.nan`.	required
`y_proxy`	`NDArray`	Array of proxy predictions, shape `(n_samples,)`. Must be fully populated (no NaN). Must have nonzero variance within each stratum.	required
`groups`	`NDArray`	Array of stratum identifiers, shape `(n_samples,)`. Unique values define the strata.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`n_bootstrap`	`int`	Number of bootstrap resamples. Defaults to `2000`.	`2000`
`power_tuning`	`bool`	If `True` (default), estimate the optimal per-stratum power-tuning parameter `lambda_k` from the bootstrap covariances. If `False`, use `lambda_k = 1.0` for all strata.	`True`
`random_seed`	`int`	Seed for the random number generator, for reproducibility. Defaults to `None` (non-deterministic).	`None`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains the bootstrap-based confidence interval, the metric name, the estimator name (`"StratifiedPTDMeanEstimator"`), and the counts `n_true` (total labeled rows) and `n_proxy` (total dataset size).

Raises:

Type	Description
`ValueError`	If `groups` contains NaN values (numeric dtype) or None values (non-numeric dtype). If `y_true`, `y_proxy`, and `groups` do not all have the same length. If any proxy value is NaN. If labeled `y_true` values are constant. If all proxy values within a stratum are identical (zero variance), which would cause a division by zero when computing the power-tuning parameter. If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.

Source code in glide/estimators/stratified_ptd.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    groups: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    n_bootstrap: int = 2000,
    power_tuning: bool = True,
    random_seed: Optional[int] = None,
) -> PredictionPoweredMeanInferenceResult:
    """Compute a stratified Predict-Then-Debias estimate of the population mean.

    The inputs are split by the unique values of ``groups``. Within each
    stratum the PTD bootstrap is run with its own power-tuning parameter,
    and the per-stratum bootstrap arrays are summed with size-proportional
    weights to form the replicates behind a single
    ``BootstrapConfidenceInterval``:

        theta = sum_k  w_k * theta_k(lambda_k)

    Here ``w_k`` is the share of all samples that fall in stratum *k*, and
    ``theta_k(lambda_k)`` is that stratum's mean estimate under tuning
    parameter ``lambda_k``.

    The weights are taken from the observed data, so the estimate is only
    statistically valid if those shares match the stratum proportions of
    the target data distribution.

    A row counts as labeled when its ``y_true`` entry is not ``NaN`` and as
    unlabeled otherwise.

    Parameters
    ----------
    y_true : NDArray
        Observations, shape ``(n_samples,)``. Finite where a label exists,
        ``np.nan`` where it does not.
    y_proxy : NDArray
        Proxy predictions, shape ``(n_samples,)``. No NaN is allowed, and the
        values must have nonzero variance within each stratum.
    groups : NDArray
        Stratum identifier of every row, shape ``(n_samples,)``. Each unique
        value defines one stratum.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage of the confidence interval. Defaults to ``0.95``.
    n_bootstrap : int, optional
        Number of bootstrap resamples. Defaults to ``2000``.
    power_tuning : bool, optional
        When ``True`` (default), the per-stratum parameter ``lambda_k`` is
        estimated from the bootstrap covariances. When ``False``,
        ``lambda_k = 1.0`` is used in every stratum.
    random_seed : int, optional
        Seed for the random number generator, for reproducibility.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Holds the bootstrap-based confidence interval, the metric name, the
        estimator name (``"StratifiedPTDMeanEstimator"``), the counts
        ``n_true`` (total labeled rows) and ``n_proxy`` (total dataset size),
        and ``effective_sample_size``, computed as
        ``floor(n_true * var_classical / var_ptd)`` where ``var_classical``
        is the interval variance of ``StratifiedClassicalMeanEstimator`` run
        on the same data with the same stratum weights.

    Raises
    ------
    ValueError
        - If ``groups`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If ``y_true``, ``y_proxy``, and ``groups`` do not all have the same length.
        - If any proxy value is NaN.
        - If labeled ``y_true`` values are constant.
        - If all proxy values within a stratum are identical (zero variance), which would
          cause a division by zero when computing the power-tuning parameter.
        - If any stratum has fewer than 2 labeled or fewer than 2 unlabeled samples.
    """
    per_stratum_arrays = _preprocess(y_true, y_proxy, groups)

    total_count = len(y_true)
    generator = np.random.default_rng(random_seed)

    # Running weighted sum of the per-stratum bootstrap replicates.
    combined_replicates = np.zeros(n_bootstrap)

    for labeled_truth, labeled_proxy, unlabeled_proxy in per_stratum_arrays:
        unlabeled_count = len(unlabeled_proxy)
        # Share of the whole dataset (labeled + unlabeled rows) held by this stratum.
        weight = (len(labeled_truth) + unlabeled_count) / total_count

        # Mean of the unlabeled proxies and the estimated variance of that mean.
        unlabeled_proxy_mean = np.mean(unlabeled_proxy)
        unlabeled_proxy_mean_var = np.var(unlabeled_proxy, ddof=1) / unlabeled_count

        # Both sampling helpers receive the same generator, so their call
        # order is kept fixed to leave seeded results unchanged.
        truth_means, proxy_means = _compute_bootstrap_labeled_means(
            labeled_truth, labeled_proxy, n_bootstrap, generator
        )
        tuning = _compute_tuning_parameter(truth_means, proxy_means, unlabeled_proxy_mean_var, power_tuning)
        stratum_replicates = _compute_bootstrap_mean_estimates(
            truth_means,
            proxy_means,
            unlabeled_proxy_mean,
            unlabeled_proxy_mean_var,
            tuning,
            generator,
        )

        combined_replicates += weight * stratum_replicates

    confidence_interval = BootstrapConfidenceInterval(
        bootstrap_estimates=combined_replicates,
        confidence_level=confidence_level,
    )

    # Proxy-free baseline with the same size-proportional stratum weights;
    # its variance is the reference for the effective sample size.
    counts_per_stratum = np.unique(groups, return_counts=True)[1]
    baseline_estimate = StratifiedClassicalMeanEstimator().estimate(
        y_true, groups, stratum_weights=counts_per_stratum / total_count
    )
    baseline_interval = baseline_estimate.confidence_interval

    labeled_total = int(np.sum(~np.isnan(y_true)))
    effective_sample_size = floor(labeled_total * baseline_interval.var / confidence_interval.var)

    return PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=labeled_total,
        n_proxy=total_count,
        effective_sample_size=effective_sample_size,
    )

glide.estimators.clustered_ptd.ClusteredPTDMeanEstimator

Clustered Predict-Then-Debias estimator for population mean.

Extends PTD to datasets where observations are grouped into clusters and each cluster is either entirely labeled or entirely unlabeled. The bootstrap resamples whole clusters rather than individual observations, which accounts for within-cluster correlation and produces valid confidence intervals under cluster sampling designs.

A power-tuning parameter λ is estimated from the joint bootstrap covariance of the labeled cluster means and labeled cluster proxy means. At each bootstrap iteration, clusters are resampled with replacement and their means are averaged, producing bootstrap replicates of the PTD estimate.

References

Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates. "Prediction-powered inference with imputed covariates and nonuniform sampling." arXiv preprint arXiv:2501.18577 (2025).

Examples:

>>> import numpy as np
>>> from glide.estimators import ClusteredPTDMeanEstimator
>>> y_true = np.array([1.0, 2.0, 3.0, 4.0, np.nan, np.nan, np.nan, np.nan])
>>> y_proxy = np.array([1.1, 2.2, 3.1, 3.9, 1.5, 1.8, 4.5, 4.8])
>>> clusters = np.array(["A", "A", "B", "B", "C", "C", "D", "D"])
>>> estimator = ClusteredPTDMeanEstimator()
>>> result = estimator.estimate(y_true, y_proxy, clusters, n_bootstrap=5, random_seed=0)
>>> print(result)
Metric: Metric
Point Estimate: 2.517
Confidence Interval (95%): [1.657, 3.497]
Estimator : ClusteredPTDMeanEstimator
n_true: 4
n_proxy: 8
Effective Sample Size: 6

Source code in glide/estimators/clustered_ptd.py

class ClusteredPTDMeanEstimator:
    """Clustered Predict-Then-Debias estimator for population mean.

    Extends PTD to datasets where observations are grouped into clusters and
    each cluster is either entirely labeled or entirely unlabeled. The bootstrap
    resamples whole clusters rather than individual observations, which accounts
    for within-cluster correlation and produces valid confidence intervals under
    cluster sampling designs.

    A power-tuning parameter λ is estimated from the joint bootstrap covariance
    of the labeled cluster means and labeled cluster proxy means. At each bootstrap
    iteration, clusters are resampled with replacement and their means are averaged,
    producing bootstrap replicates of the PTD estimate.

    References
    ----------
    Kluger, Dan M., Kerri Lu, Tijana Zrnic, Sherrie Wang, and Stephen Bates.
    "Prediction-powered inference with imputed covariates and nonuniform sampling."
    arXiv preprint arXiv:2501.18577 (2025).

    Examples
    --------
    >>> import numpy as np
    >>> from glide.estimators import ClusteredPTDMeanEstimator
    >>> y_true = np.array([1.0, 2.0, 3.0, 4.0, np.nan, np.nan, np.nan, np.nan])
    >>> y_proxy = np.array([1.1, 2.2, 3.1, 3.9, 1.5, 1.8, 4.5, 4.8])
    >>> clusters = np.array(["A", "A", "B", "B", "C", "C", "D", "D"])
    >>> estimator = ClusteredPTDMeanEstimator()
    >>> result = estimator.estimate(y_true, y_proxy, clusters, n_bootstrap=5, random_seed=0)
    >>> print(result)
    Metric: Metric
    Point Estimate: 2.517
    Confidence Interval (95%): [1.657, 3.497]
    Estimator : ClusteredPTDMeanEstimator
    n_true: 4
    n_proxy: 8
    Effective Sample Size: 6
    """

    def estimate(
        self,
        y_true: NDArray,
        y_proxy: NDArray,
        clusters: NDArray,
        metric_name: str = "Metric",
        confidence_level: float = 0.95,
        n_bootstrap: int = 2000,
        power_tuning: bool = True,
        random_seed: Optional[int] = None,
    ) -> PredictionPoweredMeanInferenceResult:
        """Estimate the population mean using the Clustered Predict-Then-Debias bootstrap.

        Computes cluster means for labeled and unlabeled clusters and uses them as
        sampling units to run the PTD bootstrap. The tuning parameter λ and the
        confidence interval are both derived from a bootstrap over the labeled clusters
        only. The sampling variability of the unlabeled proxy mean is approximated by
        a single Gaussian draw per iteration, keeping the per-iteration cost O(M_L),
        where M_L is the number of labeled clusters.

        Labeled and unlabeled clusters are distinguished by the NaN pattern in
        ``y_true``: a cluster is labeled if every one of its ``y_true`` entries is
        finite, and unlabeled if every entry is ``np.nan``. Partially labeled clusters
        are not supported.

        Parameters
        ----------
        y_true : NDArray
            Array of observations, shape ``(n_samples,)``.
            Labeled entries are finite; unlabeled entries are ``np.nan``.
            All observations in the same cluster must share the same label status.
        y_proxy : NDArray
            Array of proxy predictions, shape ``(n_samples,)``.
            Must be fully populated (no NaN).
        clusters : NDArray
            Array of cluster identifiers, shape ``(n_samples,)``.
            Unique values define the clusters.
        metric_name : str, optional
            Human-readable label for the metric. Defaults to ``"Metric"``.
        confidence_level : float, optional
            Target coverage for the confidence interval. Defaults to ``0.95``.
        n_bootstrap : int, optional
            Number of bootstrap resamples. Defaults to ``2000``.
        power_tuning : bool, optional
            If ``True`` (default), estimate the optimal power-tuning parameter
            λ from the bootstrap covariances. If ``False``, use λ = 1.0.
        random_seed : int, optional
            Seed for the random number generator, for reproducibility.
            Defaults to ``None`` (non-deterministic).

        Returns
        -------
        PredictionPoweredMeanInferenceResult
            Contains the bootstrap-based confidence interval, the metric name,
            the estimator name (``"ClusteredPTDMeanEstimator"``), the counts
            ``n_true`` (total labeled observations, as a built-in ``int``) and
            ``n_proxy`` (total dataset size), and ``effective_sample_size``,
            computed as ``floor(n_true * var_classical / var_ptd)`` where
            ``var_classical`` is the interval variance of
            ``ClusteredClassicalMeanEstimator`` run on the same ``y_true`` and
            ``clusters``.

        Raises
        ------
        ValueError
            - If ``y_true``, ``y_proxy``, and ``clusters`` do not all have the same length.
            - If labeled ``y_true`` values are constant.
            - If any proxy value is NaN.
            - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
            - If any cluster contains both labeled and unlabeled observations.
            - If fewer than 2 clusters are fully labeled.
            - If fewer than 2 clusters are fully unlabeled.
        """
        (
            labeled_true_means,
            labeled_proxy_means,
            unlabeled_proxy_means,
        ) = _preprocess(y_true, y_proxy, clusters)

        # Clusters are the sampling units: mean of the unlabeled cluster proxy
        # means and the estimated variance of that mean.
        n_unlabeled_clusters = len(unlabeled_proxy_means)
        mean_proxy_unlabeled = np.mean(unlabeled_proxy_means)
        var_proxy_unlabeled = np.var(unlabeled_proxy_means, ddof=1) / n_unlabeled_clusters

        rng = np.random.default_rng(random_seed)

        # Both sampling helpers receive the same generator, so their call order
        # is kept fixed to leave seeded results unchanged.
        bootstrap_y_true_means, bootstrap_y_proxy_labeled_means = _compute_bootstrap_labeled_means(
            labeled_true_means, labeled_proxy_means, n_bootstrap, rng
        )
        lambda_ = _compute_tuning_parameter(
            bootstrap_y_true_means, bootstrap_y_proxy_labeled_means, var_proxy_unlabeled, power_tuning
        )
        bootstrap_estimates = _compute_bootstrap_mean_estimates(
            bootstrap_y_true_means,
            bootstrap_y_proxy_labeled_means,
            mean_proxy_unlabeled,
            var_proxy_unlabeled,
            lambda_,
            rng,
        )

        confidence_interval = BootstrapConfidenceInterval(
            bootstrap_estimates=bootstrap_estimates,
            confidence_level=confidence_level,
        )
        # Cast to a built-in int so ``n_true`` is not a NumPy scalar, matching
        # StratifiedPTDMeanEstimator.
        n_labeled = int(np.sum(~np.isnan(y_true)))
        # Proxy-free baseline on the same clusters; its variance is the
        # reference for the effective sample size.
        classical_confidence_interval = ClusteredClassicalMeanEstimator().estimate(y_true, clusters).confidence_interval
        effective_sample_size = floor(n_labeled * classical_confidence_interval.var / confidence_interval.var)
        result = PredictionPoweredMeanInferenceResult(
            confidence_interval=confidence_interval,
            metric_name=metric_name,
            estimator_name=self.__class__.__name__,
            n_true=n_labeled,
            n_proxy=len(y_proxy),
            effective_sample_size=effective_sample_size,
        )
        return result

estimate

estimate(
    y_true,
    y_proxy,
    clusters,
    metric_name="Metric",
    confidence_level=0.95,
    n_bootstrap=2000,
    power_tuning=True,
    random_seed=None,
)

Estimate the population mean using the Clustered Predict-Then-Debias bootstrap.

Computes cluster means for labeled and unlabeled clusters and uses them as sampling units to run the PTD bootstrap. The tuning parameter λ and the confidence interval are both derived from a bootstrap over the labeled clusters only. The sampling variability of the unlabeled proxy mean is approximated by a single Gaussian draw per iteration, keeping the per-iteration cost O(M_L), where M_L is the number of labeled clusters.

Labeled and unlabeled clusters are distinguished by the NaN pattern in y_true: a cluster is labeled if every one of its y_true entries is finite, and unlabeled if every entry is np.nan. Partially labeled clusters are not supported.

Parameters:

Name	Type	Description	Default
`y_true`	`NDArray`	Array of observations, shape `(n_samples,)`. Labeled entries are finite; unlabeled entries are `np.nan`. All observations in the same cluster must share the same label status.	required
`y_proxy`	`NDArray`	Array of proxy predictions, shape `(n_samples,)`. Must be fully populated (no NaN).	required
`clusters`	`NDArray`	Array of cluster identifiers, shape `(n_samples,)`. Unique values define the clusters.	required
`metric_name`	`str`	Human-readable label for the metric. Defaults to `"Metric"`.	`'Metric'`
`confidence_level`	`float`	Target coverage for the confidence interval. Defaults to `0.95`.	`0.95`
`n_bootstrap`	`int`	Number of bootstrap resamples. Defaults to `2000`.	`2000`
`power_tuning`	`bool`	If `True` (default), estimate the optimal power-tuning parameter λ from the bootstrap covariances. If `False`, use λ = 1.0.	`True`
`random_seed`	`int`	Seed for the random number generator. Defaults to `None`.	`None`

Returns:

Type	Description
`PredictionPoweredMeanInferenceResult`	Contains the bootstrap-based confidence interval, the metric name, the estimator name (`"ClusteredPTDMeanEstimator"`), and the counts `n_true` (total labeled observations) and `n_proxy` (total dataset size).

Raises:

Type	Description
`ValueError`	If `y_true`, `y_proxy`, and `clusters` do not all have the same length. If labeled `y_true` values are constant. If any proxy value is NaN. If `clusters` contains NaN values (numeric dtype) or None values (non-numeric dtype). If any cluster contains both labeled and unlabeled observations. If fewer than 2 clusters are fully labeled. If fewer than 2 clusters are fully unlabeled.

Source code in glide/estimators/clustered_ptd.py

def estimate(
    self,
    y_true: NDArray,
    y_proxy: NDArray,
    clusters: NDArray,
    metric_name: str = "Metric",
    confidence_level: float = 0.95,
    n_bootstrap: int = 2000,
    power_tuning: bool = True,
    random_seed: Optional[int] = None,
) -> PredictionPoweredMeanInferenceResult:
    """Estimate the population mean using the Clustered Predict-Then-Debias bootstrap.

    Computes cluster means for labeled and unlabeled clusters and uses them as
    sampling units to run the PTD bootstrap. The tuning parameter λ and the
    confidence interval are both derived from a bootstrap over the labeled clusters
    only. The sampling variability of the unlabeled proxy mean is approximated by
    a single Gaussian draw per iteration, keeping the per-iteration cost O(M_L),
    where M_L is the number of labeled clusters.

    Labeled and unlabeled clusters are distinguished by the NaN pattern in
    ``y_true``: a cluster is labeled if every one of its ``y_true`` entries is
    finite, and unlabeled if every entry is ``np.nan``. Partially labeled clusters
    are not supported.

    Parameters
    ----------
    y_true : NDArray
        Array of observations, shape ``(n_samples,)``.
        Labeled entries are finite; unlabeled entries are ``np.nan``.
        All observations in the same cluster must share the same label status.
    y_proxy : NDArray
        Array of proxy predictions, shape ``(n_samples,)``.
        Must be fully populated (no NaN).
    clusters : NDArray
        Array of cluster identifiers, shape ``(n_samples,)``.
        Unique values define the clusters.
    metric_name : str, optional
        Human-readable label for the metric. Defaults to ``"Metric"``.
    confidence_level : float, optional
        Target coverage for the confidence interval. Defaults to ``0.95``.
    n_bootstrap : int, optional
        Number of bootstrap resamples. Defaults to ``2000``.
    power_tuning : bool, optional
        If ``True`` (default), estimate the optimal power-tuning parameter
        λ from the bootstrap covariances. If ``False``, use λ = 1.0.
    random_seed : int, optional
        Seed for the random number generator, for reproducibility.
        Defaults to ``None`` (non-deterministic).

    Returns
    -------
    PredictionPoweredMeanInferenceResult
        Contains the bootstrap-based confidence interval, the metric name,
        the estimator name (``"ClusteredPTDMeanEstimator"``), the counts
        ``n_true`` (total labeled observations, as a built-in ``int``) and
        ``n_proxy`` (total dataset size), and ``effective_sample_size``,
        computed as ``floor(n_true * var_classical / var_ptd)`` where
        ``var_classical`` is the interval variance of
        ``ClusteredClassicalMeanEstimator`` run on the same ``y_true`` and
        ``clusters``.

    Raises
    ------
    ValueError
        - If ``y_true``, ``y_proxy``, and ``clusters`` do not all have the same length.
        - If labeled ``y_true`` values are constant.
        - If any proxy value is NaN.
        - If ``clusters`` contains NaN values (numeric dtype) or None values (non-numeric dtype).
        - If any cluster contains both labeled and unlabeled observations.
        - If fewer than 2 clusters are fully labeled.
        - If fewer than 2 clusters are fully unlabeled.
    """
    (
        labeled_true_means,
        labeled_proxy_means,
        unlabeled_proxy_means,
    ) = _preprocess(y_true, y_proxy, clusters)

    # Clusters are the sampling units: mean of the unlabeled cluster proxy
    # means and the estimated variance of that mean.
    n_unlabeled_clusters = len(unlabeled_proxy_means)
    mean_proxy_unlabeled = np.mean(unlabeled_proxy_means)
    var_proxy_unlabeled = np.var(unlabeled_proxy_means, ddof=1) / n_unlabeled_clusters

    rng = np.random.default_rng(random_seed)

    # Both sampling helpers receive the same generator, so their call order
    # is kept fixed to leave seeded results unchanged.
    bootstrap_y_true_means, bootstrap_y_proxy_labeled_means = _compute_bootstrap_labeled_means(
        labeled_true_means, labeled_proxy_means, n_bootstrap, rng
    )
    lambda_ = _compute_tuning_parameter(
        bootstrap_y_true_means, bootstrap_y_proxy_labeled_means, var_proxy_unlabeled, power_tuning
    )
    bootstrap_estimates = _compute_bootstrap_mean_estimates(
        bootstrap_y_true_means,
        bootstrap_y_proxy_labeled_means,
        mean_proxy_unlabeled,
        var_proxy_unlabeled,
        lambda_,
        rng,
    )

    confidence_interval = BootstrapConfidenceInterval(
        bootstrap_estimates=bootstrap_estimates,
        confidence_level=confidence_level,
    )
    # Cast to a built-in int so ``n_true`` is not a NumPy scalar, matching
    # StratifiedPTDMeanEstimator.
    n_labeled = int(np.sum(~np.isnan(y_true)))
    # Proxy-free baseline on the same clusters; its variance is the
    # reference for the effective sample size.
    classical_confidence_interval = ClusteredClassicalMeanEstimator().estimate(y_true, clusters).confidence_interval
    effective_sample_size = floor(n_labeled * classical_confidence_interval.var / confidence_interval.var)
    result = PredictionPoweredMeanInferenceResult(
        confidence_interval=confidence_interval,
        metric_name=metric_name,
        estimator_name=self.__class__.__name__,
        n_true=n_labeled,
        n_proxy=len(y_proxy),
        effective_sample_size=effective_sample_size,
    )
    return result