Contoh KNN regressor dengan dataset employee_train.csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
Mengembalikan data ke tipe data object, kemudian diubah menjadi numerik
# Read the employee training set from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='employee_train.csv')
########################################################################################
# Data preprocessing: fill in missing values.
# Each column's missing entries are replaced by that column's most frequent
# value, which works for both text and numeric columns.
# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split and including the MonthlyIncome target column -- confirm
# this is intended.
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(data)
data_imputed = pd.DataFrame(imputed_values, columns=data.columns)

# Restore the column types of the original frame: columns that were
# 'object' in `data` stay 'object', every other column is parsed back to a
# numeric dtype.
for col in data.columns:
    if data[col].dtype != 'object':
        data_imputed[col] = pd.to_numeric(data_imputed[col])
    else:
        data_imputed[col] = data_imputed[col].astype('object')
############################################################################################
Label encoder dengan pengecualian kolom Attrition
# Turn every object-dtype column into integer codes.
# The fitted encoder of each column is stored in `label_encoders`, keyed by
# column name.
label_encoders = {}
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # Attrition is left untouched for potential classification later.
    if col == 'Attrition':
        continue
    le = LabelEncoder()
    column_as_text = data_imputed[col].astype(str)
    data_imputed[col] = le.fit_transform(column_as_text)
    label_encoders[col] = le
split data dan standardisasi
# Target is MonthlyIncome; the features are all remaining columns except
# ID and Attrition.
y = data_imputed['MonthlyIncome']
X = data_imputed.drop(columns=['MonthlyIncome', 'ID', 'Attrition'])

# Hold out 20% of the rows for testing, with a fixed seed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training rows only, then apply the same transform
# to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
Mengecek data hasil konversi tipe data
# Print a summary of the preprocessed frame (columns, non-null counts and
# dtypes) to check the result of the type conversion.
data_imputed.info()
memuat model KNN regressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Build a k-nearest-neighbours regressor with k=5 and fit it on the
# standardized training features; n_neighbors can be tuned.
knn_reg = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, y_train)
prediksi dan evaluasi
# Predict the target for the held-out test rows.
y_pred = knn_reg.predict(X_test_scaled)

# Evaluate the predictions: RMSE is the square root of the mean squared
# error, and R-squared is the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report both metrics rounded to two decimals. The header has no
# placeholders, so it is a plain string literal rather than an f-string.
print("Model Evaluation:")
print(f"RMSE: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")