In [ ]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
In [ ]:
# Load the Boston Housing CSV into a DataFrame.
# NOTE(review): this is an absolute path under /content/drive — presumably a
# Google Drive mount in Colab; the mount step is not shown in this notebook,
# so the cell will only run where that path exists.
housing_csv_path = '/content/drive/MyDrive/11th Grade/Advanced Topics Comp. Sci./Machine Learning/Data/BostonHousing.csv'
housing_data = pd.read_csv(housing_csv_path)
In [ ]:
# Preview the first five rows to sanity-check the columns that were loaded.
housing_data.head(n=5)
Out[ ]:
| | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | b | lstat | medv |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
In [ ]:
# Data preprocessing
# Count the null entries per column so gaps are visible before modelling.
missing_per_column = housing_data.isna().sum()
print("Missing values in each column:")
# end="\n\n" leaves the same trailing blank line a bare print() would produce.
print(missing_per_column, end="\n\n")
Missing values in each column: crim 0 zn 0 indus 0 chas 0 nox 0 rm 0 age 0 dis 0 rad 0 tax 0 ptratio 0 b 0 lstat 0 medv 0 dtype: int64
In [ ]:
# Separate the label from the features:
#   y -> the 'medv' column (the value being predicted)
#   X -> every remaining column
y = housing_data['medv']
X = housing_data.drop(columns=["medv"])
In [ ]:
# Split data: hold out 20% of the rows for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same coefficients and metrics) on every run.
# Outputs captured from an earlier unseeded run will not match this split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
In [ ]:
# Create a linear regression model and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
# Bare expression as the last line so the cell still displays the fitted estimator.
model
Out[ ]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [ ]:
# Predict the target for the held-out test features.
y_pred = model.predict(X=X_test)
In [ ]:
# Model evaluation: compute the test-set metrics first, then report everything.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Coefficients are printed in the same order as the columns listed right below.
print('Coefficients', model.coef_)
print(X_train.columns)
print('Intercept', model.intercept_)
print(f'MSE: {test_mse:.2f}')
print(f'R^2: {test_r2:.2f}')
Coefficients [-1.06372562e-01 4.60727522e-02 2.77431050e-02 4.42728626e+00
-1.39081525e+01 3.72044846e+00 -2.50053593e-03 -1.24722721e+00
2.92025350e-01 -1.23478432e-02 -8.80824406e-01 6.53167503e-03
-5.45683176e-01]
Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
'ptratio', 'b', 'lstat'],
dtype='object')
Intercept 33.97648697262077
MSE: 27.70
R^2: 0.70
In [ ]:
# Scatter plot of predicted vs. actual target values on the test split.
# Uses the explicit fig/ax interface rather than the pyplot state machine,
# and fixes the misspelled word in the figure title.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.8, marker="^", color="red")
ax.set_xlabel("Actual values")
ax.set_ylabel("Predicted values")
ax.set_title("Actual vs Predicted Values")
plt.show()