For this problem I used three methods, Perceptron, Logistic Regression, and KNN, and compared their accuracy on the same data to see which performs best.
################################Perceptron################################
import pandas as pd
car_data = pd.read_csv('car_data.csv')
car_data.columns = ['one','two','three','four','five','six','seven']  # rename columns; the value sets match the UCI Car Evaluation attributes (buying, maint, doors, persons, lug_boot, safety, class)
one_mapping = {'vhigh':1,'high':2,'med':3,'low':4}
two_mapping = {'vhigh':1,'high':2,'med':3,'low':4}
three_mapping = {'2':1,'3':2,'4':3,'5more':4}
four_mapping = {'2':1,'4':2,'more':3}
five_mapping = {'small':1,'med':2,'big':3}
six_mapping = {'low':1,'med':2,'high':3}
seven_mapping = {'unacc':1, 'acc':1, 'good':-1, 'vgood':-1}  # collapse the four labels into two classes: unacc/acc -> 1, good/vgood -> -1
car_data['one'] = car_data['one'].map(one_mapping)
car_data['two'] = car_data['two'].map(two_mapping)
car_data['three'] = car_data['three'].map(three_mapping)
car_data['four'] = car_data['four'].map(four_mapping)
car_data['five'] = car_data['five'].map(five_mapping)
car_data['six'] = car_data['six'].map(six_mapping)
car_data['seven'] = car_data['seven'].map(seven_mapping)
car_value = car_data.values
X = car_value[:,0:6]
Y = car_value[:,6]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1, stratify=Y)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)
from sklearn.linear_model import Perceptron
ppn = Perceptron(eta0=0.1, random_state=0)
ppn.fit(x_train_std,y_train)
y_pred = ppn.predict(x_test_std)
print('Misclassified samples: %d' %(y_test != y_pred).sum())
from sklearn.metrics import accuracy_score
print('Accuracy: %f' %accuracy_score(y_test, y_pred))
#print('Accuracy: %f' %ppn.score(x_test_std, y_test))  # an alternative way to compute the accuracy
For the first method I used a Perceptron. The dataset is first converted into a numeric matrix via the mappings above; the first six columns (the features) are stored in X, and the seventh column (the label) in Y. Since the assignment only asks for two classes, I mapped unacc and acc to 1 and good and vgood to -1. The data are then split with test_size=0.3, and stratify=Y keeps the class proportions in both the training and test sets the same as in the full dataset (the quick check below verifies this). Next is preprocessing: sc.fit(x_train) computes the mean and standard deviation of x_train, and the transform step then standardizes the features to produce x_train_std and x_test_std. Finally the Perceptron is created with learning rate eta0=0.1 and random_state=0, which fixes the seed used to shuffle the training data at each epoch; after fitting, y_pred is predicted from x_test_std, and comparing y_test with y_pred gives an accuracy of 0.947977.
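As a small check on the stratify=Y claim (a sketch, not part of the original script), the class proportions of the full set and the two splits can be compared right after the split; with stratification the three ratios should essentially match:
import numpy as np
# Proportion of the +1 class (unacc/acc) in each set; for the UCI car
# data this is roughly 0.92, and stratify=Y keeps it equal across splits.
for name, labels in [('full', Y), ('train', y_train), ('test', y_test)]:
    print('%5s: %.3f' % (name, np.mean(labels == 1)))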
################################Logistic Regression################################
# ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
#preprocessing
car_data = pd.read_csv('car_data.csv')
car_data.columns = ['one','two','three','four','five','six','seven']
one_mapping = {'vhigh':1,'high':2,'med':3,'low':4}
two_mapping = {'vhigh':1,'high':2,'med':3,'low':4}
three_mapping = {'2':1,'3':2,'4':3,'5more':4}
four_mapping = {'2':1,'4':2,'more':3}
five_mapping = {'small':1,'med':2,'big':3}
six_mapping = {'low':1,'med':2,'high':3}
seven_mapping = {'unacc':1, 'acc':1, 'good':-1, 'vgood':-1}
car_data['one'] = car_data['one'].map(one_mapping)
car_data['two'] = car_data['two'].map(two_mapping)
car_data['three'] = car_data['three'].map(three_mapping)
car_data['four'] = car_data['four'].map(four_mapping)
car_data['five'] = car_data['five'].map(five_mapping)
car_data['six'] = car_data['six'].map(six_mapping)
car_data['seven'] = car_data['seven'].map(seven_mapping)
car_data.tail()
car_value = car_data.values
X = car_value[:,0:6]
Y = car_value[:,6]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1, stratify=Y)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sn
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train_std, y_train)
y_pred=logistic_regression.predict(x_test_std)
print('Misclassified samples: %d' %(y_test != y_pred).sum())
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
For the second method I used Logistic Regression. The mapping and preprocessing steps are the same as above. The model is fit on x_train_std and y_train, and predicting on x_test_std gives an accuracy of 0.9595375722543352 with 21 misclassified samples, which is more accurate than the Perceptron. This is an expected result: when the data are not linearly separable, the Perceptron's accuracy drops. For this part I also added a confusion matrix to analyze the whole test set; the sum of the upper-right and lower-left cells equals the number of misclassified samples, as the check below confirms.
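A quick way to verify this relationship, using the metrics module already imported above (a sketch, not part of the original code):
# Cross-check: everything off the diagonal of the confusion matrix is an
# error, so these two numbers should agree.
cm = metrics.confusion_matrix(y_test, y_pred)
print('Off-diagonal sum:', cm.sum() - cm.trace())
print('Misclassified   :', (y_test != y_pred).sum())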
################################KNN################################
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
# Sequential Backward Selection
class SBS():
    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state
    def fit(self, X, y):
        # Hold out an internal validation split for scoring feature subsets
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=self.test_size,
                             random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train,
                                 X_test, y_test, self.indices_)
        self.scores_ = [score]
        # Greedily drop one feature at a time until k_features remain,
        # keeping the subset with the best validation score at each step
        while dim > self.k_features:
            scores = []
            subsets = []
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train,
                                         X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self
    def transform(self, X):
        return X[:, self.indices_]
    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score
# Test the SBS
#preprocessing
car_data = pd.read_csv('car_data.csv')
car_data.columns = ['one','two','three','four','five','six','seven']
one_mapping = {'vhigh':1,'high':2,'med':3,'low':4}
two_mapping = {'vhigh':1,'high':2,'med':3,'low':4}
three_mapping = {'2':1,'3':2,'4':3,'5more':4}
four_mapping = {'2':1,'4':2,'more':3}
five_mapping = {'small':1,'med':2,'big':3}
six_mapping = {'low':1,'med':2,'high':3}
seven_mapping = {'unacc':1, 'acc':1, 'good':-1, 'vgood':-1}
car_data['one'] = car_data['one'].map(one_mapping)
car_data['two'] = car_data['two'].map(two_mapping)
car_data['three'] = car_data['three'].map(three_mapping)
car_data['four'] = car_data['four'].map(four_mapping)
car_data['five'] = car_data['five'].map(five_mapping)
car_data['six'] = car_data['six'].map(six_mapping)
car_data['seven'] = car_data['seven'].map(seven_mapping)
car_data.tail()
car_value = car_data.values
X = car_value[:,0:6]
y = car_value[:,6]
# Note: unlike the first two scripts, this split uses random_state=0 and no stratify
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=0)
# Feature scaling - computes mean and standard deviation
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
knn = KNeighborsClassifier(n_neighbors=5)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)
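# (Added sketch, not in the original script:) the SBS results computed above
# are otherwise unused; plotting accuracy against the number of surviving
# features shows which subset sizes still classify well. Uses the matplotlib
# import above and the subsets_/scores_ lists built by SBS.fit.
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.xlabel('Number of features')
plt.ylabel('Accuracy')
plt.grid()
plt.show()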
knn.fit(X_train_std, y_train)
y_pred = knn.predict(X_test_std)
print('Misclassified samples: %d' %(y_test != y_pred).sum())
print('Test accuracy:', knn.score(X_test_std, y_test))
For the third method I used the KNN algorithm. The mapping and preprocessing are the same as above (note that this script splits with random_state=0 and without stratify). Fitting on X_train_std and y_train and predicting on X_test_std gives an accuracy of 0.9845857418111753 with only 8 misclassified samples, clearly better than the two algorithms above. This is plausible: Logistic Regression is a linear classifier, while KNN is a non-parametric method that can follow a nonlinear decision boundary, so KNN reaches the higher accuracy here. The sketch below additionally sweeps a few values of k to check how sensitive the result is to the neighborhood size.
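Since the report fixes n_neighbors=5, a short sweep over k (an illustrative sketch reusing the standardized splits above, not part of the original code) would look like this:
# Test accuracy for several neighborhood sizes; odd k avoids ties
# in the binary majority vote.
for k in [1, 3, 5, 7, 9, 11]:
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train_std, y_train)
    print('k=%2d  accuracy=%.4f' % (k, clf.score(X_test_std, y_test)))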