Source code for nnsa.stats.intervals

"""
Code related to confindence and prediction intervals.
"""

import numpy as np
import scipy.stats

__all__ = [
    'prediction_interval',
]


[docs]def prediction_interval(x, y_true, y_pred, x_query, alpha=0.05, dof=None):
    """
    Compute the prediction intervals for the given linear 1D regression model at the given query points.

    Args:
        x (np.ndarray): 1D array with the predictor values of the training dataset (array with length n_samples).
        y_true (np.ndarray): 1D array with the true outcome values of the training dataset.
        y_pred (np.ndarray): 1D array with the predicted outcome values (from the fitted model) of the training dataset.
        x_query (np.ndarray): the x-points at which to compute the prediction interval.
        alpha (float, optional): the significance determining the prediction interval (PI).
            PI = (1 - alpha)*100%
            Defaults to 0.05.
        dof (int, optional): the degrees of freedom left after fitting the model.
            If None, a linear regression model with one predicter and an intercept is assumed, corresponding to
            dof = n_samples - 2.
            Defaults to None.

    Returns:
        delta_y (np.ndarray): array with same length as x_query containing the prediction margin.
            The prediction interval is: y +/- delta_y.

    Examples:
        The following example case is taken from https://newonlinecourses.science.psu.edu/stat462/node/128/.
        >>> import pandas as pd
        >>> from sklearn.linear_model import LinearRegression

        Read data file.
        >>> datafile = 'https://newonlinecourses.science.psu.edu/stat462/sites/onlinecourses.science.psu.edu.stat462/files/data/infectionrisk/index.txt'
        >>> df = pd.read_csv(datafile, delimiter='\\t')

        >>> df = df[(df['Region'] == 1) | (df['Region'] == 2)]  # Only keep region 1 and 2.
        >>> df = df[df['Stay'] <= 14]  # Remove 2 outliers.
        >>> x = df['Stay'].values.reshape(-1, 1)
        >>> y_true = df['InfctRsk'].values

        # Fit model (with an intercept).
        >>> model = LinearRegression().fit(x, y_true)
        >>> y_pred = model.predict(x)

        # Get prediction interval at x = 10.
        >>> x_query = np.array([10])
        >>> delta_y = prediction_interval(x[:, 0], y_true, y_pred, x_query, alpha=0.05)
        >>> print('{:.4f}'.format(delta_y[0]))
        2.0699
    """
    def check_x_input(a, var_name):
        if a.ndim > 1:
            if a.shape[0] == np.size(a):
                # Extra dimensions are redundant. Flatten the array.
                a = a.flatten()
            else:
                raise ValueError('Unexpected ndim for `{0}` ({0}.shape = {1}).'
                                 '`{0}` should be a one-dimensional array.'.format(var_name, a.shape))
        return a

    x = check_x_input(x, var_name='x')
    x_query = check_x_input(x_query, var_name='x_query')

    n = len(x)

    if dof is None:
        # Assume the underlying model that predicted y_pred is a linear regression model with an intercept,
        # corresponding to dof = n - 2.
        dof = n - 2

    # Compute mean squared error.
    error = y_pred - y_true
    mse = (1/dof) * np.sum(error**2)

    # Compute the term in the square root.
    x_mean = np.mean(x)
    x_demean = x - x_mean
    x_q_demean = x_query - x_mean
    term = mse * (1 + 1/n + (x_q_demean**2)/(np.sum(x_demean**2)))

    # Critical two-sided t-value.
    p = 1 - alpha/2
    t_score = scipy.stats.t.ppf(p, dof)

    # Compute the prediction interval.
    delta_y = t_score * np.sqrt(term)

    return delta_y