import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Read the data
#crimesDF =pd.read_csv("crimes.csv",encoding="ISO-8859-1")
crimesDF =pd.read_csv("communities.csv",encoding="ISO-8859-1")
#Remove the 1st 7 columns
print(crimesDF.shape[1]) #128
crimesDF1=crimesDF.iloc[:,6:crimesDF.shape[1]]
# Convert to numeric
crimesDF2 = crimesDF1.apply(pd.to_numeric, errors='coerce')
# Impute NA to 0s
crimesDF2.fillna(0, inplace=True)
# Select the X (feature vatiables - all)
X=crimesDF2.iloc[:,0:120]
# Set the target
y=crimesDF2.iloc[:,121]
print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state = 0)
# Fit a multivariate regression model
linreg = LinearRegression().fit(X_train, y_train)
# compute and print the R Square
print('R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linreg.score(X_test, y_test)))
## R-squared score (training): 0.78
## R-squared score (test): 0.03
test的score只有0.03 不知道是什么原因呢