# Reduce skew: apply base-10 log to the log10 columns, natural log to the rest.
for cols, transform in ((log10_cols, np.log10), (log_cols, np.log)):
    for name in cols:
        X[name] = transform(X[name])
# Per-group preprocessing: standardize numerics, one-hot encode categoricals
# (unknown categories at predict time are ignored rather than raising).
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))])

# Route each feature list through its matching transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# End-to-end model: preprocessing followed by a KNN classifier.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier()),
])
# Hold out 20% for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("model score for training set: %.3f" % train_score)
print("model score for testing set: %.3f" % test_score)
# Columns treated as categorical: 'Type', 'Network', and a set of
# numeric-string columns.
categorical_cols = [
    'Type', 'Network',
    '5000', '4500', '4000', '3500', '3000', '2500', '2000', '1500',
    '1000', '900', '800', '700', '600', '500', '400', '300', '200', '100',
]
# One-hot encode each column in turn; each get_dummies call returns a new
# frame with that column replaced by its indicator columns.
for col in categorical_cols:
    X_train[col] = pd.Categorical(X_train[col])
    X_train = pd.get_dummies(X_train, columns=[col])
# Standardize all continuous features: fit the scaler on the training split
# only, then apply the already-fitted scaling to the test split.
# BUG FIX: the test set was previously scaled with fit_transform, which
# refits the scaler on test data — data leakage, and the two splits end up
# scaled with different statistics.
all_numeric = log_cols + log10_cols + numerical_cols
X_train[all_numeric] = standard_scaler.fit_transform(X_train[all_numeric])
X_test[all_numeric] = standard_scaler.transform(X_test[all_numeric])
# Features are every column except the target; 'Status' is the label.
X = df.drop(columns=['Status'])
y = df['Status']
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Two scaler instances, fitted later in the script.
minMax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
# NOTE(review): the four lists below are unfilled template placeholders —
# each currently contains '' and the Ellipsis object. They must be replaced
# with real column names before this script can run; indexing a DataFrame
# with '' / ... will fail.
categorical_cols = ['',...]
log10_cols = ['',...]
log_cols = ['',...]
numerical_cols = ['',...]
def _preprocess_features(frame, scaler, fit_scaler):
    """One-hot encode categoricals, log-transform designated columns, scale numerics.

    Extracted from two near-identical inline passes over X_train and X_test.

    Parameters:
        frame: DataFrame to transform; the transformed frame is returned
            (get_dummies returns a new frame, so callers must rebind).
        scaler: shared MinMaxScaler instance.
        fit_scaler: True to fit the scaler on `frame` (training split only),
            False to reuse the already-fitted training statistics (test split).

    Returns:
        The transformed DataFrame.
    """
    # One-hot encode each categorical column in turn.
    for column in categorical_cols:
        frame[column] = pd.Categorical(frame[column])
        frame = pd.get_dummies(frame, columns=[column])
    # Log transforms on the designated column groups.
    for column in log10_cols:
        frame[column] = np.log10(frame[column])
    for column in log_cols:
        frame[column] = np.log(frame[column])
    # Min-max scale the plain numeric columns; only the training split fits
    # the scaler, so the test split is scaled with training statistics.
    if fit_scaler:
        frame[numerical_cols] = scaler.fit_transform(frame[numerical_cols])
    else:
        frame[numerical_cols] = scaler.transform(frame[numerical_cols])
    return frame

# Apply identical preprocessing to both splits (previously duplicated inline).
X_train = _preprocess_features(X_train, minMax_scaler, fit_scaler=True)
X_test = _preprocess_features(X_test, minMax_scaler, fit_scaler=False)
# Baseline random forest; fixed seed keeps results reproducible.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict both splits up front, then score each against its labels.
rf_train_preds = rf_model.predict(X_train)
rf_test_preds = rf_model.predict(X_test)
rf_train_acc = accuracy_score(y_train, rf_train_preds)
rf_test_acc = accuracy_score(y_test, rf_test_preds)

print(f'Random Forest training accuracy: {rf_train_acc:.4f}')
print(f'Random Forest test accuracy: {rf_test_acc:.4f}')
# Fresh split for the KNN hyperparameter search (same seed as earlier splits).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Search space for KNN. FIX: the ranges were previously named `list` and
# `list2`, shadowing the `list` builtin; they were also identical, built
# with a needless comprehension instead of list(range(...)).
neighbor_range = list(range(1, 30))
leaf_size_range = list(range(1, 30))
grid_params = {
    'n_neighbors': neighbor_range,
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': leaf_size_range,
}

# 3-fold grid search over all parameter combinations, using all cores.
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose=1, cv=3, n_jobs=-1)
g_res = gs.fit(X_train, y_train)
print(g_res.best_score_, g_res.best_params_)

# Refit a KNN with the best parameters and report split accuracies.
knn = KNeighborsClassifier(**gs.best_params_)
knn.fit(X_train, y_train)
y_hat = knn.predict(X_train)
y_knn = knn.predict(X_test)
print('Training set accuracy: ', metrics.accuracy_score(y_train, y_hat))
print('Test set accuracy: ',metrics.accuracy_score(y_test, y_knn))

# NOTE(review): cross-validation here runs on X/y as split above, which may
# not have the preprocessing applied to X_train elsewhere in this script —
# confirm this is intended.
scores = cross_val_score(knn, X, y, cv=5)
print('Model accuracy: ',np.mean(scores))