InĀ [31]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
le_df = pd.read_csv("life_expectancy.csv") 
le_df.columns = le_df.columns.str.strip()
InĀ [8]:
sns.boxplot(x=le_df['Life expectancy'])            # inspecting distribution of target variable using a boxplot
plt.show()
No description has been provided for this image
InĀ [43]:
le_df.head()
InĀ [34]:
cols_to_drop = [
    'Country',
    'Year',
    'Adult Mortality',
    'GDP',
    'Income composition of resources',
    'Status',
    'thinness  1-19 years',
    'thinness 5-9 years',
    'HIV/AIDS',
    'Diphtheria',
    'under-five deaths',
    'Polio'
]

le_df = le_df.drop(cols_to_drop, axis=1)

le_df = le_df.dropna() # drops nulls
InĀ [35]:
corr_mat = le_df.corr()                               # compute the correlation matrix

mask = np.zeros_like(corr_mat, dtype=bool)            # mask out upper triangle (correlation matrices are symmetric)
mask[np.triu_indices_from(mask)] = True


f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)   # generate custom colormap
sns.heatmap(corr_mat, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.show()
No description has been provided for this image
InĀ [36]:
sns.pairplot(le_df)
plt.show()
No description has been provided for this image
InĀ [56]:
X = le_df.iloc[:, 1:]
Y = le_df.iloc[:, 0]
InĀ [57]:
X, Y
Out[57]:
(      infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles  \
 0                62     0.01               71.279624         65.0     1154   
 1                64     0.01               73.523582         62.0      492   
 2                66     0.01               73.219243         64.0      430   
 3                69     0.01               78.184215         67.0     2787   
 4                71     0.01                7.097109         68.0     3013   
 ...             ...      ...                     ...          ...      ...   
 2933             27     4.36                0.000000         68.0       31   
 2934             26     4.06                0.000000          7.0      998   
 2935             25     4.43                0.000000         73.0      304   
 2936             25     1.72                0.000000         76.0      529   
 2937             24     1.68                0.000000         79.0     1483   
 
        BMI  Total expenditure  Population  Schooling  
 0     19.1               8.16  33736494.0       10.1  
 1     18.6               8.18    327582.0       10.0  
 2     18.1               8.13  31731688.0        9.9  
 3     17.6               8.52   3696958.0        9.8  
 4     17.2               7.87   2978599.0        9.5  
 ...    ...                ...         ...        ...  
 2933  27.1               7.13  12777511.0        9.2  
 2934  26.7               6.52  12633897.0        9.5  
 2935  26.3               6.53    125525.0       10.0  
 2936  25.9               6.16  12366165.0        9.8  
 2937  25.5               7.10  12222251.0        9.8  
 
 [1657 rows x 9 columns],
 0       65.0
 1       59.9
 2       59.9
 3       59.5
 4       59.2
         ... 
 2933    44.3
 2934    44.5
 2935    44.8
 2936    45.3
 2937    46.0
 Name: Life expectancy, Length: 1657, dtype: float64)
InĀ [58]:
sns.pairplot(X)
plt.show()
No description has been provided for this image
InĀ [59]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
scaled_X = scaler.transform(X)

print(f"Feature Means: {scaled_X.mean(axis=0)}")               # all effectively equal to 0
print(f"Feature Variances: {scaled_X.var(axis=0)}")            # all effectively equal to 1
Feature Means: [-1.71525102e-17 -1.02915061e-16  1.11491316e-16 -8.46905192e-17
  0.00000000e+00  2.22982633e-16  2.80872355e-16 -1.28643827e-17
  6.86100409e-16]
Feature Variances: [1. 1. 1. 1. 1. 1. 1. 1. 1.]
InĀ [72]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

lambdas = [0.01,0.1,0.5,1,1.5, 2,5,10, 20, 30,50, 100, 200, 300,1000,10000, 100000]
N = len(lambdas)
coefs_mat = np.zeros((scaled_X.shape[1], N))
for i in range(N):
    L = lambdas[i]
    ridge_lm = Ridge(alpha=L).fit(scaled_X, Y)
    coefs_mat[:,i] = ridge_lm.coef_

colors = ['red', 'brown', 'green', 'blue', 'orange', 'pink', 'purple', 'grey', 'yellow', 'violet']
plt.figure(figsize=(10,10))
for i in range(X.shape[1]):
    lab = "X" + str(i + 1)
    plt.plot(np.log(lambdas), coefs_mat[i], label=lab, color=colors[i])
    plt.legend()
plt.xlabel(r"log($\lambda$)")
plt.ylabel("Estimated Coefficient")
plt.show()
No description has been provided for this image
InĀ [207]:
lambdas = np.arange(0, 50.1, step=0.1)
n = scaled_X.shape[0]
N = lambdas.shape[0]
CV_score = np.zeros(N)
curIdx = 0
#Y = Y.to_numpy()
for L in lambdas:
    sq_errs = 0.
    for i in range(n):
        x_i = scaled_X[i]
        x_removed_i = np.delete(scaled_X, i, axis=0)      # dataset without i-th observation (loocv)
        y_i = Y[i]
        y_removed_i = np.delete(Y, i, axis=0)

        mod = Ridge(alpha=L).fit(x_removed_i, y_removed_i)
        sq_errs += ((mod.predict(x_i.reshape(1,-1))-y_i)**2).item()

    CV_score[curIdx] = sq_errs/n
    curIdx += 1

min_idx = np.argmin(CV_score)
plt.plot(lambdas, CV_score)
plt.xlabel(r"$\lambda$")
plt.ylabel("LOOCV Score (Ridge)")
plt.axvline(x=lambdas[min_idx], color="red")
plt.annotate(f"$\\lambda = {lambdas[min_idx]}$", xy=(10,32.750))
plt.show()
No description has been provided for this image
InĀ [211]:
rs = np.random.RandomState(123)
from sklearn.metrics import r2_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error 

train_prop = 0.7   
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_prop, random_state=rs)

# cv is about choosing the correct alphas, and this selection will be skewed if the data is not normalised.
# furthermore, lasso and ridge are also affected by scaling.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
###########################################


# fit models to train data
m_linearReg = LinearRegression().fit(X_train, y_train)
m_lasso = Lasso(0.0).fit(X_train, y_train)
m_ridge = Ridge(7.7).fit(X_train, y_train)

# compute predictions of each model on the train data
ypred_train_linearReg = m_linearReg.predict(X_train)
ypred_train_lasso = m_lasso.predict(X_train)
ypred_train_ridge = m_ridge.predict(X_train)

# compute predictions of each model on the test data
ypred_test_linearReg = m_linearReg.predict(X_test)
ypred_test_lasso = m_lasso.predict(X_test)
ypred_test_ridge = m_ridge.predict(X_test)

# first compute metrics and store them in array
loss_data = [
    ['linear regression', 
     mean_squared_error(y_train, ypred_train_linearReg), 
     mean_squared_error(y_test, ypred_test_linearReg), 
     mean_absolute_error(y_train, ypred_train_linearReg), 
     mean_absolute_error(y_test, ypred_test_linearReg),
     0,
     r2_score(ypred_train_linearReg, y_train),
     r2_score(ypred_test_linearReg, y_test),
    ],
    ['lasso', 
     mean_squared_error(y_train, ypred_train_lasso), 
     mean_squared_error(y_test, ypred_test_lasso),
     mean_absolute_error(y_train, ypred_train_lasso), 
     mean_absolute_error(y_test, ypred_test_lasso),
     m_lasso.alpha,
     r2_score(ypred_train_lasso, y_train),
     r2_score(ypred_test_lasso, y_test),
    ],
    ['ridge', 
     mean_squared_error(y_train, ypred_train_ridge), 
     mean_squared_error(y_test, ypred_test_ridge),
     mean_absolute_error(y_train, ypred_train_ridge), 
     mean_absolute_error(y_test, ypred_test_ridge),
     m_ridge.alpha,
     r2_score(ypred_train_ridge, y_train),
     r2_score(ypred_test_ridge, y_test),
    ]
]

# create a pandas dataframe and feed it the array and naming the columns
loss_df = pd.DataFrame(loss_data, 
                       columns=['Model', 'train MSE', 'test MSE', 'train MAE', 'test MAE', 'regularisation', 'r2 train', 'r2 test'])


m_lassoCV = LassoCV(alphas=np.logspace(-10, 10, 1000)).fit(X_train, y_train)              
ypred_train_lassoCV = m_lassoCV.predict(X_train)
ypred_test_lassoCV = m_lassoCV.predict(X_test)

# add row to loss_df
lassoCV_row = {'Model': 'lasso CV', 
               'train MSE': mean_squared_error(y_train, ypred_train_lassoCV), 
               'test MSE': mean_squared_error(y_test, ypred_test_lassoCV),
               'train MAE': mean_absolute_error(y_train, ypred_train_lassoCV), 
               'test MAE' :mean_absolute_error(y_test, ypred_test_lassoCV),
               'regularisation': m_lassoCV.alpha_,
               'r2 train': r2_score(ypred_train_lassoCV, y_train),
               'r2 test': r2_score(ypred_test_lassoCV, y_test),
              }

m_ridgeCV = RidgeCV(cv=None,alphas=np.logspace(-20, 20, 1001)).fit(X_train, y_train)                
ypred_train_ridgeCV = m_ridgeCV.predict(X_train)
ypred_test_ridgeCV = m_ridgeCV.predict(X_test)

# add row to loss_df
ridgeCV_row = {'Model': 'ridge CV', 
               'train MSE': mean_squared_error(y_train, ypred_train_ridgeCV), 
               'test MSE': mean_squared_error(y_test, ypred_test_ridgeCV),
               'train MAE': mean_absolute_error(y_train, ypred_train_ridgeCV), 
               'test MAE' :mean_absolute_error(y_test, ypred_test_ridgeCV),
               'regularisation': m_ridgeCV.alpha_,
               'r2 train': r2_score(ypred_train_ridgeCV, y_train),
               'r2 test': r2_score(ypred_test_ridgeCV, y_test),}

pd.concat([loss_df, pd.DataFrame([lassoCV_row]), pd.DataFrame([ridgeCV_row])], ignore_index=True)
Out[211]:
Model train MSE test MSE train MAE test MAE regularisation r2 train r2 test
0 linear regression 33.336502 30.081001 4.428437 4.140409 0.000000 0.253466 0.291035
1 lasso 33.336502 30.081001 4.428437 4.140409 0.000000 0.253466 0.291035
2 ridge 33.339160 30.105262 4.426888 4.139568 7.700000 0.245652 0.283138
3 lasso CV 33.403707 30.189396 4.426494 4.135041 0.067426 0.227865 0.265868
4 ridge CV 33.340938 30.113315 4.426602 4.139361 10.000000 0.243316 0.280778

In both cases (lasso, ridge) with manual alphas chosen from performing the manual LOOCV from below, we beat the scikit-learn LassoCV and RidgeCV functions.

InĀ [162]:
lambdas = [0.01,0.1,0.5,1,1.5, 2,5,10, 20, 30,50, 100, 200, 300]
N = len(lambdas)
coefs_mat = np.zeros((scaled_X.shape[1], N))
for i in range(N):
    L = lambdas[i]
    lasso_lm = Lasso(alpha=L).fit(scaled_X,Y)
    coefs_mat[:,i] = lasso_lm.coef_

colors = ['red', 'brown', 'green', 'blue', 'orange', 'pink', 'purple', 'grey', 'yellow', 'violet']
plt.figure(figsize=(10,10))
for i in range(X.shape[1]):
    lab = "X" + str(i + 1)
    plt.plot(np.log(lambdas), coefs_mat[i], label=lab, color=colors[i])
    plt.legend()
plt.xlabel(r"log($\lambda$)")
plt.ylabel("Estimated Coefficient")
plt.show()
No description has been provided for this image
InĀ [166]:
import warnings
warnings.filterwarnings("ignore")

lambdas = np.arange(0, 20.1, step=0.1)
n = scaled_X.shape[0]
N = lambdas.shape[0]
CV_score = np.zeros(N)
curIdx = 0
for L in lambdas:
    sq_errs = 0.
    for i in range(n):
        x_i = scaled_X[i]
        x_removed_i = np.delete(scaled_X, i, axis=0)      # dataset without i-th observation
        y_i = Y[i]
        y_removed_i = np.delete(Y, i, axis=0)

        mod = Lasso(alpha=L).fit(x_removed_i, y_removed_i, )
        sq_errs += ((mod.predict(x_i.reshape(1,-1))-y_i)**2).item()

    CV_score[curIdx] = sq_errs/n
    curIdx += 1

min_idx = np.argmin(CV_score)
plt.plot(lambdas, CV_score)
plt.xlabel(r"$\lambda$")
plt.ylabel("LOOCV Score (LASSO)")
plt.axvline(x=lambdas[min_idx], color="red")
plt.annotate(f"$\\lambda = {lambdas[min_idx]}$", xy=(10,50))
plt.show()
No description has been provided for this image