import pandas
import sklearn
import numpy as np
import pandas as pd
from sklearn.cross_validation import KFold
from sklearn.neighbors import KNeighborsClassifier
from time import time as t
from pandas import DataFrame
import sys
sys.path.append("..")
# Load the wine dataset. The code below reads its 'Class' column as the target
# and treats every other column as a feature.
# NOTE(review): hard-coded absolute path -- the script only runs where this
# file exists; consider making it configurable.
data = pd.read_csv('D:/wine.csv')
def crossVal(estimator, X, y, cv):
    """Return the accuracy of *estimator* on each cross-validation fold.

    Parameters
    ----------
    estimator : object exposing ``fit(X, y)`` and ``predict(X)``
        ``fit`` is called once per fold on that fold's training rows.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels, selected with the same positional indices as ``X``.
    cv : iterable of ``(train_index, test_index)`` pairs
        Positional row indices for each fold.

    Returns
    -------
    pandas.Series
        One value per fold, in fold order: the fraction of test rows whose
        prediction equals the true label.
    """
    fold_accuracies = []
    for train_index, test_index in cv:
        estimator.fit(X.iloc[train_index], y.iloc[train_index])
        predicted = np.asarray(estimator.predict(X.iloc[test_index]))
        actual = np.asarray(y.iloc[test_index])
        # Mean of the element-wise boolean match is the fold accuracy.
        fold_accuracies.append(np.mean(predicted == actual))
    # Explicit dtype keeps the result a float Series even when cv is empty.
    return pd.Series(fold_accuracies, dtype=float)
# ---------------------------------------------------------------------------
# Choose the number of neighbours for a k-NN classifier with 5-fold
# cross-validation, first on the raw features and then on scaled features.
#
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20. On newer
# versions use sklearn.model_selection.KFold(n_splits=5, shuffle=True,
# random_state=42) and pass kfold.split(features) as ``cv``.
# ---------------------------------------------------------------------------
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier  # the classification algorithm under test
from sklearn.preprocessing import scale

kfold = KFold(n=len(data),       # number of rows
              n_folds=5,         # cross-validate over 5 folds
              shuffle=True,      # shuffle the sample before forming the folds
              random_state=42)   # fixed seed for the shuffle

# Loop-invariant inputs: every column except 'Class' is a feature.
features = data.drop(['Class'], axis=1)
labels = data['Class']

accuracies_dict = {}
for k in range(1, 50 + 1):
    classifier = KNeighborsClassifier(n_neighbors=k)
    scores = crossVal(estimator=classifier,
                      X=features,
                      y=labels,
                      cv=kfold)
    accuracies_dict[k] = scores.mean()

optimal_k = max(accuracies_dict, key=accuracies_dict.get)
# Report the score of the selected k, not of the last k the loop visited.
print('optimal k: (', optimal_k, ', %.4f' % accuracies_dict[optimal_k], ')')

scaled_data = scale(features)
# crossVal indexes with .iloc, so wrap the scaled array in a DataFrame.
scaled_features = pd.DataFrame(scaled_data)

scaled_accuracies_dict = {}
for k in range(1, 50 + 1):
    classifier = KNeighborsClassifier(n_neighbors=k)
    scores = crossVal(estimator=classifier,
                      X=scaled_features,
                      y=labels,
                      cv=kfold)
    scaled_accuracies_dict[k] = scores.mean()

optimal_k = max(scaled_accuracies_dict, key=scaled_accuracies_dict.get)
# Result after feature scaling.
print('optimal k ater scaling: (', optimal_k, ', %.4f' % scaled_accuracies_dict[optimal_k], ')')