import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
train_df = train_data
test_df = test_data
X_train = train_df.drop('ack', axis=1)
y_train = train_df['ack']
X_test = test_df.drop('ack', axis=1)
y_test = test_df['ack']
def oneR(X_train, y_train):
accuracies = []
for col in X_train.columns:
values = X_train[col].unique()
max_acc = 0
best_threshold = None
best_class = None
for value in values:
y_pred = (X_train[col] > value).astype(int)
acc = accuracy_score(y_train, y_pred)
if acc > max_acc:
max_acc = acc
best_threshold = value
best_class = 1 if value < X_train[col].mean() else 0
accuracies.append(max_acc)
best_feature = X_train.columns[accuracies.index(max(accuracies))]
best_threshold = sorted(X_train[best_feature].unique())[1]
best_class = 1 if best_threshold < X_train[best_feature].mean() else 0
print("Best feature:", best_feature)
print("Threshold:", best_threshold)
print("Class below threshold:", best_class)
return best_feature, best_threshold, best_class
best_feature, best_threshold, best_class = oneR(X_train, y_train)
def predict_oneR(X):
y_pred = (X[best_feature] > best_threshold).astype(int)
return y_pred.apply(lambda x: 1 if x else 0)
y_pred = predict_oneR(X_test)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
dt = DecisionTreeClassifier()
lr = LogisticRegression()
models = [('OneR', predict_oneR),
('Decision Tree', dt),
('Logistic Regression', lr)]
for name, model in models:
if name == 'OneR':
model = predict_oneR
else:
model.fit(X_train, y_train)
if name == 'Decision Tree':
from sklearn import tree
tree.plot_tree(model)
if name == 'OneR':
y_pred = model(X_test)
else:
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
print(name)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
Делал OneR алгоритм, но в сравнении с tree класификатором выдает лучшие параметрыю. Только начинаю с изучением машинного обучения, поэтому хотел бы узнать есть ли ошибка у меня в коде выше. Или что то в датасете.
Output:
Best feature: scroll_move_total_rel_distance
Threshold: -0.9427631178094024
Class below threshold: 1
OneR
Accuracy: 0.5497709287796751
Precision: 0.5497709287796751
Recall: 1.0
Decision Tree
Accuracy: 0.45022907122032485
Precision: 0.0
Recall: 0.0
Logistic Regression
Accuracy: 0.5497709287796751
Precision: 0.5497709287796751
Recall: 1.0