Source code for nnsa.stats.intervals

"""
Code related to confindence and prediction intervals.
"""

import numpy as np
import scipy.stats

__all__ = [
    'prediction_interval',
]


[docs]def prediction_interval(x, y_true, y_pred, x_query, alpha=0.05, dof=None): """ Compute the prediction intervals for the given linear 1D regression model at the given query points. Args: x (np.ndarray): 1D array with the predictor values of the training dataset (array with length n_samples). y_true (np.ndarray): 1D array with the true outcome values of the training dataset. y_pred (np.ndarray): 1D array with the predicted outcome values (from the fitted model) of the training dataset. x_query (np.ndarray): the x-points at which to compute the prediction interval. alpha (float, optional): the significance determining the prediction interval (PI). PI = (1 - alpha)*100% Defaults to 0.05. dof (int, optional): the degrees of freedom left after fitting the model. If None, a linear regression model with one predicter and an intercept is assumed, corresponding to dof = n_samples - 2. Defaults to None. Returns: delta_y (np.ndarray): array with same length as x_query containing the prediction margin. The prediction interval is: y +/- delta_y. Examples: The following example case is taken from https://newonlinecourses.science.psu.edu/stat462/node/128/. >>> import pandas as pd >>> from sklearn.linear_model import LinearRegression Read data file. >>> datafile = 'https://newonlinecourses.science.psu.edu/stat462/sites/onlinecourses.science.psu.edu.stat462/files/data/infectionrisk/index.txt' >>> df = pd.read_csv(datafile, delimiter='\\t') >>> df = df[(df['Region'] == 1) | (df['Region'] == 2)] # Only keep region 1 and 2. >>> df = df[df['Stay'] <= 14] # Remove 2 outliers. >>> x = df['Stay'].values.reshape(-1, 1) >>> y_true = df['InfctRsk'].values # Fit model (with an intercept). >>> model = LinearRegression().fit(x, y_true) >>> y_pred = model.predict(x) # Get prediction interval at x = 10. >>> x_query = np.array([10]) >>> delta_y = prediction_interval(x[:, 0], y_true, y_pred, x_query, alpha=0.05) >>> print('{:.4f}'.format(delta_y[0])) 2.0699 """ def check_x_input(a, var_name): if a.ndim > 1: if a.shape[0] == np.size(a): # Extra dimensions are redundant. Flatten the array. a = a.flatten() else: raise ValueError('Unexpected ndim for `{0}` ({0}.shape = {1}).' '`{0}` should be a one-dimensional array.'.format(var_name, a.shape)) return a x = check_x_input(x, var_name='x') x_query = check_x_input(x_query, var_name='x_query') n = len(x) if dof is None: # Assume the underlying model that predicted y_pred is a linear regression model with an intercept, # corresponding to dof = n - 2. dof = n - 2 # Compute mean squared error. error = y_pred - y_true mse = (1/dof) * np.sum(error**2) # Compute the term in the square root. x_mean = np.mean(x) x_demean = x - x_mean x_q_demean = x_query - x_mean term = mse * (1 + 1/n + (x_q_demean**2)/(np.sum(x_demean**2))) # Critical two-sided t-value. p = 1 - alpha/2 t_score = scipy.stats.t.ppf(p, dof) # Compute the prediction interval. delta_y = t_score * np.sqrt(term) return delta_y