# Read in the data (Fashion-MNIST: label in column 0, then 784 pixel columns per 28x28 image)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# The CSVs ship with a header row, so let pandas parse it instead of slicing it off.
# np.int was removed in NumPy 1.20+; use the builtin int instead.
train = pd.read_csv('fashion-mnist_train.csv')
X_train = train.iloc[:, 1:].astype(int)
y_train = train.iloc[:, 0].astype(int)
test = pd.read_csv('fashion-mnist_test.csv')
X_test = test.iloc[:, 1:].astype(int)
y_test = test.iloc[:, 0].astype(int)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve

# Validation curve over the number of trees in the forest
param_range = [10, 20, 30, 50, 80, 100]
train_scores, test_scores = validation_curve(
    estimator=RandomForestClassifier(criterion='gini', random_state=1, n_jobs=3),
    X=X_train,
    y=y_train,
    param_name='n_estimators',
    param_range=param_range,
    cv=5)
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.legend(loc='lower right')
plt.grid()
plt.ylim([0.7, 1.025])
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.tight_layout()
plt.show()
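# The mean curves above hide fold-to-fold variance. A minimal sketch of adding
# +/- one standard-deviation bands around each curve (plain matplotlib fill_between;
# the alpha value is an arbitrary choice, not from the original run):
train_std = np.std(train_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.legend(loc='lower right')
plt.grid()
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.show()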
import time

# Baseline: forest on the raw (uncompressed) pixels, with n_estimators chosen from the curve above
forest_orig_data = RandomForestClassifier(criterion='gini', n_estimators=50, random_state=1, n_jobs=3)
start = time.time()
forest_orig_data.fit(X_train, y_train)
score_fod = forest_orig_data.score(X_test, y_test)
stop = time.time()
print('Test Accuracy: %.4f\n\nProcessing Time: %.2fs.' % (score_fod, (stop - start)))
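# A class-by-class look at where the baseline forest errs; a minimal sketch using
# sklearn.metrics.confusion_matrix (y_pred is a new name introduced here).
# Rows are true labels, columns are predictions:
from sklearn.metrics import confusion_matrix
y_pred = forest_orig_data.predict(X_test)
print(confusion_matrix(y_test, y_pred))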
# Standardization: fit the scaler on X_train only, then apply the same fitted
# transform to both X_train and X_test
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
# Data compression: LDA (supervised; allows at most n_classes - 1 = 9 components for 10 classes)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=8)
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)
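# A quick check on how much between-class variance the 8 discriminants capture,
# via the fitted estimator's explained_variance_ratio_ attribute (available with
# the default 'svd' solver):
print(lda.explained_variance_ratio_)
print('Total: %.4f' % lda.explained_variance_ratio_.sum())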
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
param_C = [0.01, 0.1, 1.0, 10.0, 100.0]
lr_solver = ['newton-cg', 'sag', 'saga', 'lbfgs']
param_grid = {'C': param_C,
              'solver': lr_solver}
gs = GridSearchCV(
    estimator=LogisticRegression(penalty='l2', multi_class='auto', max_iter=200, random_state=1),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1)
gs = gs.fit(X_train_lda, y_train)
print('Best validation score: %.4f' % gs.best_score_, '\nBest parameter combination:', gs.best_params_)

# best_estimator_ is already refit on the full training set (refit=True is the
# GridSearchCV default), so the extra fit below is redundant but harmless
lr = gs.best_estimator_
lr.fit(X_train_lda, y_train)
print('Test accuracy:', lr.score(X_test_lda, y_test))
# Time a fit + score cycle for the tuned logistic regression
start = time.time()
lr.fit(X_train_lda, y_train)
lr.score(X_test_lda, y_test)  # score recomputed here only so the timing matches the other models
stop = time.time()
print('Processing Time: %.2fs.' % (stop - start))
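# Per-class precision and recall for the tuned logistic regression; a minimal
# sketch using sklearn.metrics.classification_report:
from sklearn.metrics import classification_report
print(classification_report(y_test, lr.predict(X_test_lda)))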
from sklearn.model_selection import learning_curve

# Learning curve for the tuned logistic regression
train_sizes, train_scores, test_scores = learning_curve(
    estimator=lr,
    X=X_train_lda,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    n_jobs=1)
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.legend(loc='lower right')
plt.xlabel('training set size')
plt.ylabel('accuracy')
plt.grid()
plt.tight_layout()
plt.show()
from sklearn.svm import SVC
param_gamma = [0.1, 1.0, 10.0, 'scale']  # 'scale' uses 1 / (n_features * X.var()) as the value of gamma
param_C = [0.1, 1.0, 10.0]
param_grid_svc = [{'gamma': param_gamma,
                   'C': param_C,
                   'kernel': ['rbf']}]
gs_svc = GridSearchCV(
    estimator=SVC(random_state=1),
    param_grid=param_grid_svc,
    scoring='accuracy',
    cv=5,
    n_jobs=-1)
gs_svc = gs_svc.fit(X_train_lda, y_train)
print('Best validation score: %.4f' % gs_svc.best_score_)
print('Best parameter combination:', gs_svc.best_params_)

# Refit the SVM with the parameters selected by the grid search above
svm = SVC(C=1.0, gamma=0.1, kernel='rbf', random_state=1)
start = time.time()
svm.fit(X_train_lda, y_train)
score_svm = svm.score(X_test_lda, y_test)
stop = time.time()
print('Test Accuracy: %.4f' % score_svm)
print('Processing Time: %.2fs.' % (stop - start))
param_n = [50, 100, 150, 200, 300, 500]
param_grid_forest = {'n_estimators': param_n}
gs_forest = GridSearchCV(
    estimator=RandomForestClassifier(criterion='gini', random_state=1),
    param_grid=param_grid_forest,
    scoring='accuracy',
    cv=5,
    n_jobs=-1)
gs_forest = gs_forest.fit(X_train_lda, y_train)
print('Best n_estimators:', gs_forest.best_params_['n_estimators'])
print('Best validation score: %.4f' % gs_forest.best_score_)
# Refit the forest with the n_estimators value selected by the grid search above
forest = RandomForestClassifier(criterion='gini', n_estimators=200, random_state=1, n_jobs=2)
start = time.time()
forest.fit(X_train_lda, y_train)
score_forest = forest.score(X_test_lda, y_test)
stop = time.time()
print('Test Accuracy: %.4f' % score_forest)
print('\nProcessing Time: %.2fs.' % (stop - start))
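# The fitted forest exposes impurity-based importances for the 8 LDA components;
# a minimal inspection sketch:
for i, imp in enumerate(forest.feature_importances_):
    print('LD%d: %.4f' % (i + 1, imp))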
# Learning curve for the tuned forest
train_sizes, train_scores, test_scores = learning_curve(
    estimator=forest,
    X=X_train_lda,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    n_jobs=-1)
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.legend(loc='lower right')
plt.ylim([0.7, 1.025])
plt.xlabel('training set size')
plt.ylabel('accuracy')
plt.grid()
plt.tight_layout()
plt.show()
# Same learning curve but with max_features=None, i.e. every feature considered at each split
forest2 = RandomForestClassifier(criterion='gini', n_estimators=200, max_features=None, random_state=1, n_jobs=2)
train_sizes, train_scores, test_scores = learning_curve(
    estimator=forest2,
    X=X_train_lda,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    n_jobs=-1)
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.legend(loc='lower right')
plt.ylim([0.7, 1.025])
plt.xlabel('training set size')
plt.ylabel('accuracy')
plt.grid()
plt.tight_layout()
plt.show()
from sklearn.neighbors import KNeighborsClassifier
param_k = [10, 20, 30, 50, 70, 90]
param_grid_knn = {'n_neighbors': param_k}
gs_knn = GridSearchCV(
    estimator=KNeighborsClassifier(p=2, metric='minkowski'),
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1)
gs_knn = gs_knn.fit(X_train_lda, y_train)
print('Best n_neighbors:', gs_knn.best_params_['n_neighbors'])
print('Best validation score: %.4f' % gs_knn.best_score_)
knn = KNeighborsClassifier(n_neighbors=50, p=2, metric='minkowski')  # k = 50, as selected by the grid search above
start = time.time()
knn.fit(X_train_lda, y_train)
score_knn = knn.score(X_test_lda, y_test)
stop = time.time()
print('Test Accuracy: %.4f' % score_knn)
print('\nProcessing Time: %.2fs.' % (stop - start))
# Learning curve for the tuned k-NN
train_sizes, train_scores, test_scores = learning_curve(
    estimator=knn,
    X=X_train_lda,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    n_jobs=-1)
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.legend(loc='lower right')
plt.ylim([0.7, 0.9])
plt.xlabel('training set size')
plt.ylabel('accuracy')
plt.grid()
plt.tight_layout()
plt.show()
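# Before combining the four tuned models, a minimal sketch comparing them on equal
# footing with cross_val_score (imported earlier); clone() refits fresh copies so
# the already-fitted estimators stay untouched:
from sklearn.base import clone
for name, clf in [('lr', lr), ('svm', svm), ('forest', forest), ('knn', knn)]:
    scores = cross_val_score(estimator=clone(clf), X=X_train_lda, y=y_train, cv=5, n_jobs=-1)
    print('%s: %.4f +/- %.4f' % (name, scores.mean(), scores.std()))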
from sklearn.ensemble import VotingClassifier

# Weighted majority ('hard') voting over the four tuned classifiers
weights = [0.8, 1.0, 1.2, 1.0]
ens_clf = VotingClassifier(estimators=[('lr', lr), ('svm', svm), ('forest', forest), ('knn', knn)],
                           voting='hard', weights=weights)
start = time.time()
ens_clf.fit(X_train_lda, y_train)
score_ens = ens_clf.score(X_test_lda, y_test)
stop = time.time()
print('Test Accuracy: %.4f\n\nProcessing Time: %.2fs.' % (score_ens, (stop - start)))
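# A soft-voting variant averages predicted class probabilities instead of counting
# votes. This is a sketch, not a result from the original run: SVC needs
# probability=True before it can supply probabilities (which adds an internal
# cross-validation step and slows the fit); svm_prob and ens_soft are names
# introduced here.
svm_prob = SVC(C=1.0, gamma=0.1, kernel='rbf', probability=True, random_state=1)
ens_soft = VotingClassifier(estimators=[('lr', lr), ('svm', svm_prob), ('forest', forest), ('knn', knn)],
                            voting='soft', weights=weights)
ens_soft.fit(X_train_lda, y_train)
print('Soft-voting Test Accuracy: %.4f' % ens_soft.score(X_test_lda, y_test))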