Source Information¶


Created by:

Updated: October 01, 2024 by Gloria Seo


Goal¶

This tutorial outlines the steps to perform regression analysis using the diabetes dataset in Python. By using scikit-learn, you can create, fit, and evaluate a linear regression model, allowing you to understand the relationships between various factors and disease progression.

Regression¶

In this tutorial, we'll explore regression models using Python's scikit-learn (sklearn) package and its built-in diabetes data set. Please keep in mind that although regression is considered one of the simplest machine learning techniques, a thorough understanding of its assumptions and limitations is essential for a correct interpretation of the results.

We'll start by loading the linear_model and diabetes data set from sklearn. Note that we're only loading the components that we need for this exercise since the entire sklearn package is extremely large.

Required Modules for the Jupyter Notebook¶

Before running the notebook, make sure to load the following modules.

Packages: numpy, scipy, pandas, matplotlib, scikit-learn (sklearn). From sklearn we use linear_model, mean_squared_error, r2_score, and load_diabetes.

In [1]:
import numpy as np
np.float = float  # Monkey patch: np.float/np.int were removed in NumPy 1.24, but older sklearn versions still reference them
np.int = int

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import load_diabetes
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

Loading the Diabetes Dataset¶

Next, load the diabetes dataset using the load_diabetes() function:

In [2]:
diabetes = load_diabetes()

Let's take a look at the diabetes data set. We are interested in how the disease progression depends on factors such as age, sex, BMI (body mass index) and blood pressure. Note that these factors have been mean-centered and scaled by the standard deviation.

The disease progression is the dependent variable and age, sex, BMI etc. are the independent variables.
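As a quick sanity check (not part of the original notebook), we can verify the standardization described above: each feature column should have mean zero, and, per the dataset description, a sum of squares equal to one.

```python
import numpy as np
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()

# Each feature column is mean-centered and scaled so that
# its sum of squares equals 1 (see the dataset description below).
col_means = diabetes.data.mean(axis=0)
col_sumsq = (diabetes.data ** 2).sum(axis=0)

print(np.allclose(col_means, 0.0, atol=1e-4))  # True
print(np.allclose(col_sumsq, 1.0, atol=1e-4))  # True
```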

In [3]:
diabetes
Out[3]:
{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
        128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
        150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
        200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
         42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
         83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
        104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
        173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
        107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
         60., 174., 259., 178., 128.,  96., 126., 288.,  88., 292.,  71.,
        197., 186.,  25.,  84.,  96., 195.,  53., 217., 172., 131., 214.,
         59.,  70., 220., 268., 152.,  47.,  74., 295., 101., 151., 127.,
        237., 225.,  81., 151., 107.,  64., 138., 185., 265., 101., 137.,
        143., 141.,  79., 292., 178.,  91., 116.,  86., 122.,  72., 129.,
        142.,  90., 158.,  39., 196., 222., 277.,  99., 196., 202., 155.,
         77., 191.,  70.,  73.,  49.,  65., 263., 248., 296., 214., 185.,
         78.,  93., 252., 150.,  77., 208.,  77., 108., 160.,  53., 220.,
        154., 259.,  90., 246., 124.,  67.,  72., 257., 262., 275., 177.,
         71.,  47., 187., 125.,  78.,  51., 258., 215., 303., 243.,  91.,
        150., 310., 153., 346.,  63.,  89.,  50.,  39., 103., 308., 116.,
        145.,  74.,  45., 115., 264.,  87., 202., 127., 182., 241.,  66.,
         94., 283.,  64., 102., 200., 265.,  94., 230., 181., 156., 233.,
         60., 219.,  80.,  68., 332., 248.,  84., 200.,  55.,  85.,  89.,
         31., 129.,  83., 275.,  65., 198., 236., 253., 124.,  44., 172.,
        114., 142., 109., 180., 144., 163., 147.,  97., 220., 190., 109.,
        191., 122., 230., 242., 248., 249., 192., 131., 237.,  78., 135.,
        244., 199., 270., 164.,  72.,  96., 306.,  91., 214.,  95., 216.,
        263., 178., 113., 200., 139., 139.,  88., 148.,  88., 243.,  71.,
         77., 109., 272.,  60.,  54., 221.,  90., 311., 281., 182., 321.,
         58., 262., 206., 233., 242., 123., 167.,  63., 197.,  71., 168.,
        140., 217., 121., 235., 245.,  40.,  52., 104., 132.,  88.,  69.,
        219.,  72., 201., 110.,  51., 277.,  63., 118.,  69., 273., 258.,
         43., 198., 242., 232., 175.,  93., 168., 275., 293., 281.,  72.,
        140., 189., 181., 209., 136., 261., 113., 131., 174., 257.,  55.,
         84.,  42., 146., 212., 233.,  91., 111., 152., 120.,  67., 310.,
         94., 183.,  66., 173.,  72.,  49.,  64.,  48., 178., 104., 132.,
        220.,  57.]),
 'frame': None,
 'DESCR': '.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, T-Cells (a type of white blood cells)\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, thyroid stimulating hormone\n      - s5      ltg, lamotrigine\n      - s6      glu, blood sugar level\n\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)',
 'feature_names': ['age',
  'sex',
  'bmi',
  'bp',
  's1',
  's2',
  's3',
  's4',
  's5',
  's6'],
 'data_filename': '/cm/shared/apps/spack/cpu/opt/spack/linux-centos8-zen/gcc-8.3.1/anaconda3-2020.11-da3i7hmt6bdqbmuzq6pyt7kbm47wyrjp/lib/python3.8/site-packages/sklearn/datasets/data/diabetes_data.csv.gz',
 'target_filename': '/cm/shared/apps/spack/cpu/opt/spack/linux-centos8-zen/gcc-8.3.1/anaconda3-2020.11-da3i7hmt6bdqbmuzq6pyt7kbm47wyrjp/lib/python3.8/site-packages/sklearn/datasets/data/diabetes_target.csv.gz'}

To make this a little more readable, we can print just the description.

Exploring the Dataset¶

You can examine the dataset to understand its structure and contents:

In [4]:
print(diabetes.DESCR)
.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)

The linear regression fitting function (LinearRegression.fit) expects a 2D array for the data (n_samples x n_features). We'll start off by working with a single feature at a time.

Preparing Data for Regression¶

We need to prepare the data for regression analysis by defining the independent (features) and dependent (target) variables:

In [5]:
# Extract the BMI column (index 2) from the data set as an (n x 1) array
bmi              = diabetes.data[:, np.newaxis, 2]
disease_progress = diabetes.target

Next we'll fit the model and use it to calculate the expected disease progression. We also calculate the $R^2$ coefficient, the fraction of the variance in the dependent variable that is explained by the model.

$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$
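As a sanity check (not in the original notebook), we can compute $R^2$ directly from this definition and compare it with sklearn's r2_score:

```python
import numpy as np
from sklearn import linear_model
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score

diabetes = load_diabetes()
bmi = diabetes.data[:, np.newaxis, 2]
y = diabetes.target

regr = linear_model.LinearRegression().fit(bmi, y)
y_pred = regr.predict(bmi)

# R^2 straight from the definition above
ss_res = np.sum((y - y_pred) ** 2)       # residual sum of squares
ss_tot = np.sum((y - y.mean()) ** 2)     # total sum of squares
r2_manual = 1.0 - ss_res / ss_tot

print(np.isclose(r2_manual, r2_score(y, y_pred)))  # True
```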

Creating a Regression Model¶

You can create a linear regression model using LinearRegression from sklearn:

In [6]:
# Create and fit the model
regr = linear_model.LinearRegression()
regr.fit(bmi, disease_progress)

# Apply the model (predict the disease progression from BMI using linear model)
disease_progress_pred = regr.predict(bmi)

print('Intercept: ', regr.intercept_)
print('Coefficient: ', regr.coef_[0])
print('Variance score (R2): %.2f' % r2_score(disease_progress, disease_progress_pred))
Intercept:  152.1334841628967
Coefficient:  949.4352603839491
Variance score (R2): 0.34

Visualizing Results¶

Finally, you can visualize the results using matplotlib:

In [7]:
plt.scatter(bmi, disease_progress,  color='black')
plt.plot(bmi, disease_progress_pred, color='blue', linewidth=3)
plt.xlabel('BMI')
plt.ylabel('disease progression')

plt.show()
[Scatter plot of disease progression vs. BMI with the fitted regression line]

In this example, we used all of the available data to train the model. Often in machine learning applications we want to set aside some of the data to test the model, which allows us to determine whether the model has predictive value.

Let's go back and divide our data into training and testing sets.

In [8]:
bmi_train = bmi[:-20] # All but last 20 elements
bmi_test  = bmi[-20:] # Last 20 elements

disease_progress_train = disease_progress[:-20] # All but last 20 elements
disease_progress_test  = disease_progress[-20:] # Last 20 elements
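As an aside, scikit-learn's train_test_split can produce the same kind of split; with shuffle=False it reproduces the manual "last 20 rows as test" slicing above (in practice you would usually leave shuffling on). A minimal sketch:

```python
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

diabetes = load_diabetes()
bmi = diabetes.data[:, np.newaxis, 2]
y = diabetes.target

# shuffle=False keeps the rows in order, so the test set is the last 20 rows
bmi_train, bmi_test, y_train, y_test = train_test_split(
    bmi, y, test_size=20, shuffle=False)

print(bmi_train.shape, bmi_test.shape)  # (422, 1) (20, 1)
```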
In [9]:
# Create and fit the model
regr = linear_model.LinearRegression()
regr.fit(bmi_train, disease_progress_train)

# Apply the model
disease_progress_pred = regr.predict(bmi_test)

print('Intercept: ', regr.intercept_)
print('Coefficient: ', regr.coef_[0])
print('Variance score (R2): %.2f' % r2_score(disease_progress_test, disease_progress_pred))
Intercept:  152.91886182616167
Coefficient:  938.2378612512636
Variance score (R2): 0.47
In [10]:
plt.scatter(bmi_test, disease_progress_test,  color='black')
plt.plot(bmi_test, disease_progress_pred, color='blue', linewidth=3)
plt.xlabel('BMI')
plt.ylabel('disease progression')

plt.show()
[Scatter plot of disease progression vs. BMI for the test set, with the fitted regression line]

Multiple linear regression¶

So far, we have looked at a simple linear regression model that depends on a single variable. We can generalize this to a multiple linear regression model of the form

$y_i = b + a_1 x_{1,i} + a_2 x_{2,i} + \dots$

In the example below, we use all ten baseline variables to build our model.

In [11]:
# Create and fit the model
regr = linear_model.LinearRegression()
regr.fit(diabetes.data, diabetes.target)

# Apply the model (predict the disease progression from all ten features using the linear model)
disease_progress_pred = regr.predict(diabetes.data)

print('Intercept: ', regr.intercept_)
print('Coefficient: ', regr.coef_)
print('Variance score (R2): %.2f' % r2_score(diabetes.target, disease_progress_pred))
Intercept:  152.1334841628965
Coefficient:  [ -10.01219782 -239.81908937  519.83978679  324.39042769 -792.18416163
  476.74583782  101.04457032  177.06417623  751.27932109   67.62538639]
Variance score (R2): 0.52
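Under the hood, predict() simply evaluates the linear formula above with the fitted intercept and coefficients; a quick check (not in the original notebook):

```python
import numpy as np
from sklearn import linear_model
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

regr = linear_model.LinearRegression().fit(X, y)

# predict() is just the linear formula y = b + X @ a
manual_pred = regr.intercept_ + X @ regr.coef_
print(np.allclose(manual_pred, regr.predict(X)))  # True
```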

Combining sklearn with pandas¶

So far, we've been doing things the hard way: selecting columns from our 2D array of data, saving them as new arrays, and making sure that we retain two dimensions even if we only want one column.

A much easier approach is to use pandas and work directly with data frames. This has the additional advantage that you can use the DataFrame methods to operate on the data. In the following examples, we train using all the independent variables, using two variables, and using each variable individually.

In [12]:
#import pandas as pd
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
disease_progress = pd.DataFrame(diabetes.target)
In [13]:
df.head(3)
Out[13]:
age sex bmi bp s1 s2 s3 s4 s5 s6
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019908 -0.017646
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068330 -0.092204
2 0.085299 0.050680 0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592 0.002864 -0.025930
In [14]:
disease_progress.head(3)
Out[14]:
0
0 151.0
1 75.0
2 141.0

Build the multiple linear regression model using all independent variables¶

In [15]:
regr = linear_model.LinearRegression()
regr.fit(df, disease_progress)
disease_progress_pred = regr.predict(df)

print('Intercept: ', regr.intercept_)
print('Coefficient: ', regr.coef_[0])
print('Variance score (R2): %.2f' % r2_score(disease_progress, disease_progress_pred))
Intercept:  [152.13348416]
Coefficient:  [ -10.01219782 -239.81908937  519.83978679  324.39042769 -792.18416163
  476.74583782  101.04457032  177.06417623  751.27932109   67.62538639]
Variance score (R2): 0.52

Build the multiple linear regression model using age and bmi¶

In [16]:
regr = linear_model.LinearRegression()
regr.fit(df[["age", "bmi"]], disease_progress)
disease_progress_pred = regr.predict(df[["age", "bmi"]])

print('Intercept: ', regr.intercept_)
print('Coefficient: ', regr.coef_[0])
print('Variance score (R2): %.2f' % r2_score(disease_progress, disease_progress_pred))
Intercept:  [152.13348416]
Coefficient:  [133.01372901 924.81645876]
Variance score (R2): 0.35

Build the ordinary linear regression model using bmi - set aside data for testing¶

In [17]:
regr = linear_model.LinearRegression()
regr.fit(df[["bmi"]][:-20], disease_progress[:-20])
disease_progress_pred = regr.predict(df[["bmi"]][-20:])

print('Intercept: ', regr.intercept_)
print('Coefficient: ', regr.coef_[0])
print('Variance score (R2): %.2f' % r2_score(disease_progress[-20:], disease_progress_pred[-20:]))
Intercept:  [152.91886183]
Coefficient:  [938.23786125]
Variance score (R2): 0.47

Perform ordinary linear regression, looping over all features¶

In [18]:
for feature in df.columns.values:
    print("Feature: ", feature)
    regr = linear_model.LinearRegression()
    regr.fit(df[[feature]], disease_progress)
    disease_progress_pred = regr.predict(df[[feature]])

    print('Intercept: ', regr.intercept_)
    print('Coefficient: ', regr.coef_[0])
    print('Variance score (R2): %.2f' % r2_score(disease_progress, disease_progress_pred))
    print()
Feature:  age
Intercept:  [152.13348416]
Coefficient:  [304.18307453]
Variance score (R2): 0.04

Feature:  sex
Intercept:  [152.13348416]
Coefficient:  [69.71535568]
Variance score (R2): 0.00

Feature:  bmi
Intercept:  [152.13348416]
Coefficient:  [949.43526038]
Variance score (R2): 0.34

Feature:  bp
Intercept:  [152.13348416]
Coefficient:  [714.7416437]
Variance score (R2): 0.19

Feature:  s1
Intercept:  [152.13348416]
Coefficient:  [343.25445189]
Variance score (R2): 0.04

Feature:  s2
Intercept:  [152.13348416]
Coefficient:  [281.78459335]
Variance score (R2): 0.03

Feature:  s3
Intercept:  [152.13348416]
Coefficient:  [-639.14527932]
Variance score (R2): 0.16

Feature:  s4
Intercept:  [152.13348416]
Coefficient:  [696.88303009]
Variance score (R2): 0.19

Feature:  s5
Intercept:  [152.13348416]
Coefficient:  [916.13872282]
Variance score (R2): 0.32

Feature:  s6
Intercept:  [152.13348416]
Coefficient:  [619.22282068]
Variance score (R2): 0.15

Using column of data frame as target¶

In [19]:
#import pandas as pd
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df['Disease progression'] = diabetes.target
In [20]:
df.head()
Out[20]:
age sex bmi bp s1 s2 s3 s4 s5 s6 Disease progression
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019908 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068330 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592 0.002864 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022692 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031991 -0.046641 135.0
In [21]:
regr = linear_model.LinearRegression()
regr.fit(df[['age', 'bmi']], df[['Disease progression']])
disease_progress_pred = regr.predict(df[['age', 'bmi']])

print('Intercept: ', regr.intercept_)
print('Coefficient: ', regr.coef_[0])
print('Variance score (R2): %.2f' % r2_score(df[['Disease progression']], disease_progress_pred))
Intercept:  [152.13348416]
Coefficient:  [133.01372901 924.81645876]
Variance score (R2): 0.35

Working with categorical variables¶

Keep in mind that sex is a categorical variable and cannot be treated as a continuous predictor in linear regression. You might recall when we looped over the features that the $R^2$ value for the dependence of disease progression on sex was zero to two decimal places. We'll need to take a different approach.

This was easy to overlook since sex appears as a numerical value in the data set. Male and female were likely assigned the values $\pm 1$, which then became 0.050680 and -0.044642 after normalizing the data.
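We can confirm that sex takes only two distinct values in the standardized data (a quick check, not part of the original notebook; the specific values match the output of In [3] above):

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# 'sex' is binary: only two distinct values survive the standardization
sex_values = np.sort(df['sex'].unique())
print(sex_values)  # approximately [-0.0446, 0.0507]
```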

In [22]:
progression_male = np.array(df['Disease progression'][df['sex'] > 0])
progression_female = np.array(df['Disease progression'][df['sex'] < 0])
In [23]:
progression_male.std()
Out[23]:
78.26358288372121
In [24]:
progression_female.std()
Out[24]:
75.7441070979943
In [25]:
#from scipy import stats
stats.ttest_ind(progression_male, progression_female, equal_var=True)
Out[25]:
Ttest_indResult(statistic=0.9041147550244715, pvalue=0.3664292946519826)

Since the p-value (0.37) is well above the usual 0.05 threshold, we cannot conclude that the mean disease progression differs between male and female patients.

Submit Ticket¶

If you find anything that needs to be changed or edited, or if you would like to provide feedback or contribute to the notebook, please submit a ticket by contacting us at:

Email: consult@sdsc.edu

We appreciate your input and will review your suggestions promptly!
