This is a REGRESSION problem. There are ten numeric predictive variables: age, sex, body mass index, average blood pressure, and six blood serum measurements. They were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure (integer between 25 and 346) of disease progression one year after baseline.
Note: Each of the 10 feature variables has been mean centered and scaled by the standard deviation times the square root of n_samples (i.e. the sum of squares of each column totals 1).
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
# Fetch the diabetes dataset that ships with scikit-learn and show its
# built-in description.
diabetes = datasets.load_diabetes()
print(diabetes.DESCR)

# Feature matrix used as model input.
diabetes_X = diabetes.data

# Hold out the last 20 samples for testing; everything before them is used
# for training. Features and targets are sliced with the same bounds so the
# rows stay aligned.
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
diabetes_y_train, diabetes_y_test = (
    diabetes.target[:-20],
    diabetes.target[-20:],
)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Build a linear regression model and fit it on the training split.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

# The fitted coefficients
print('Coefficients: \n', regr.coef_)

# Predict the target for the held-out test samples.
diabetes_y_pred = regr.predict(diabetes_X_test)

# Mean squared error between the true and predicted test targets.
print("Mean squared error (on test set): %.2f"
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))

# r2_score computes the coefficient of determination (R^2), which is a
# different metric from the explained variance score, so it is labelled as
# such in the output. 1 is a perfect prediction.
print('Coefficient of determination (max_value=1 for perfect prediction): %.2f'
      % r2_score(diabetes_y_test, diabetes_y_pred))