"""
Code related to confindence and prediction intervals.
"""
import numpy as np
import scipy.stats
__all__ = [
'prediction_interval',
]
[docs]def prediction_interval(x, y_true, y_pred, x_query, alpha=0.05, dof=None):
"""
Compute the prediction intervals for the given linear 1D regression model at the given query points.
Args:
x (np.ndarray): 1D array with the predictor values of the training dataset (array with length n_samples).
y_true (np.ndarray): 1D array with the true outcome values of the training dataset.
y_pred (np.ndarray): 1D array with the predicted outcome values (from the fitted model) of the training dataset.
x_query (np.ndarray): the x-points at which to compute the prediction interval.
alpha (float, optional): the significance determining the prediction interval (PI).
PI = (1 - alpha)*100%
Defaults to 0.05.
dof (int, optional): the degrees of freedom left after fitting the model.
If None, a linear regression model with one predicter and an intercept is assumed, corresponding to
dof = n_samples - 2.
Defaults to None.
Returns:
delta_y (np.ndarray): array with same length as x_query containing the prediction margin.
The prediction interval is: y +/- delta_y.
Examples:
The following example case is taken from https://newonlinecourses.science.psu.edu/stat462/node/128/.
>>> import pandas as pd
>>> from sklearn.linear_model import LinearRegression
Read data file.
>>> datafile = 'https://newonlinecourses.science.psu.edu/stat462/sites/onlinecourses.science.psu.edu.stat462/files/data/infectionrisk/index.txt'
>>> df = pd.read_csv(datafile, delimiter='\\t')
>>> df = df[(df['Region'] == 1) | (df['Region'] == 2)] # Only keep region 1 and 2.
>>> df = df[df['Stay'] <= 14] # Remove 2 outliers.
>>> x = df['Stay'].values.reshape(-1, 1)
>>> y_true = df['InfctRsk'].values
# Fit model (with an intercept).
>>> model = LinearRegression().fit(x, y_true)
>>> y_pred = model.predict(x)
# Get prediction interval at x = 10.
>>> x_query = np.array([10])
>>> delta_y = prediction_interval(x[:, 0], y_true, y_pred, x_query, alpha=0.05)
>>> print('{:.4f}'.format(delta_y[0]))
2.0699
"""
def check_x_input(a, var_name):
if a.ndim > 1:
if a.shape[0] == np.size(a):
# Extra dimensions are redundant. Flatten the array.
a = a.flatten()
else:
raise ValueError('Unexpected ndim for `{0}` ({0}.shape = {1}).'
'`{0}` should be a one-dimensional array.'.format(var_name, a.shape))
return a
x = check_x_input(x, var_name='x')
x_query = check_x_input(x_query, var_name='x_query')
n = len(x)
if dof is None:
# Assume the underlying model that predicted y_pred is a linear regression model with an intercept,
# corresponding to dof = n - 2.
dof = n - 2
# Compute mean squared error.
error = y_pred - y_true
mse = (1/dof) * np.sum(error**2)
# Compute the term in the square root.
x_mean = np.mean(x)
x_demean = x - x_mean
x_q_demean = x_query - x_mean
term = mse * (1 + 1/n + (x_q_demean**2)/(np.sum(x_demean**2)))
# Critical two-sided t-value.
p = 1 - alpha/2
t_score = scipy.stats.t.ppf(p, dof)
# Compute the prediction interval.
delta_y = t_score * np.sqrt(term)
return delta_y