Files
task_fraud_detection_bolade/experiments/model_training.ipynb
T
boladeE 50e95445fb First commit
Defined file structure and completed EDA
2025-04-24 23:39:36 +01:00

216 lines
16 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution in training set:\n",
"is_fraud\n",
"0 902418\n",
"1 5254\n",
"Name: count, dtype: int64\n",
"\n",
"Class distribution in test set:\n",
"is_fraud\n",
"0 386751\n",
"1 2252\n",
"Name: count, dtype: int64\n",
"📊 Evaluating Baseline Models:\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1408: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n"
]
},
{
"ename": "ValueError",
"evalue": "Found input variables with inconsistent numbers of samples: [907658, 907672]",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 80\u001b[39m\n\u001b[32m 78\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m📊 Evaluating Baseline Models:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 79\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m model \u001b[38;5;129;01min\u001b[39;00m models:\n\u001b[32m---> \u001b[39m\u001b[32m80\u001b[39m \u001b[43mevaluate_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_test\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 82\u001b[39m \u001b[38;5;66;03m# ⚖️ SMOTE Experiment\u001b[39;00m\n\u001b[32m 83\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m📈 Experiment with SMOTE for class imbalance:\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mevaluate_model\u001b[39m\u001b[34m(model, X_train, X_test, y_train, y_test)\u001b[39m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mevaluate_model\u001b[39m(model, X_train, X_test, y_train, y_test):\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 40\u001b[39m y_pred = model.predict(X_test)\n\u001b[32m 41\u001b[39m y_prob = model.predict_proba(X_test)[:, \u001b[32m1\u001b[39m]\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\base.py:1389\u001b[39m, in \u001b[36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m 1382\u001b[39m estimator._validate_params()\n\u001b[32m 1384\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 1385\u001b[39m skip_parameter_validation=(\n\u001b[32m 1386\u001b[39m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 1387\u001b[39m )\n\u001b[32m 1388\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1222\u001b[39m, in \u001b[36mLogisticRegression.fit\u001b[39m\u001b[34m(self, X, y, sample_weight)\u001b[39m\n\u001b[32m 1219\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1220\u001b[39m _dtype = [np.float64, np.float32]\n\u001b[32m-> \u001b[39m\u001b[32m1222\u001b[39m X, y = \u001b[43mvalidate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1223\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 1224\u001b[39m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1225\u001b[39m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1226\u001b[39m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcsr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 1227\u001b[39m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43m_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1228\u001b[39m \u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mC\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 1229\u001b[39m \u001b[43m \u001b[49m\u001b[43maccept_large_sparse\u001b[49m\u001b[43m=\u001b[49m\u001b[43msolver\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mliblinear\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msaga\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1230\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1231\u001b[39m check_classification_targets(y)\n\u001b[32m 1232\u001b[39m \u001b[38;5;28mself\u001b[39m.classes_ = np.unique(y)\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:2961\u001b[39m, in \u001b[36mvalidate_data\u001b[39m\u001b[34m(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)\u001b[39m\n\u001b[32m 2959\u001b[39m y = check_array(y, input_name=\u001b[33m\"\u001b[39m\u001b[33my\u001b[39m\u001b[33m\"\u001b[39m, **check_y_params)\n\u001b[32m 2960\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m2961\u001b[39m X, y = \u001b[43mcheck_X_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2962\u001b[39m out = X, y\n\u001b[32m 2964\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m check_params.get(\u001b[33m\"\u001b[39m\u001b[33mensure_2d\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m):\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1389\u001b[39m, in \u001b[36mcheck_X_y\u001b[39m\u001b[34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[39m\n\u001b[32m 1370\u001b[39m X = check_array(\n\u001b[32m 1371\u001b[39m X,\n\u001b[32m 1372\u001b[39m accept_sparse=accept_sparse,\n\u001b[32m (...)\u001b[39m\u001b[32m 1384\u001b[39m input_name=\u001b[33m\"\u001b[39m\u001b[33mX\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 1385\u001b[39m )\n\u001b[32m 1387\u001b[39m y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m \u001b[43mcheck_consistent_length\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1391\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m X, y\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:475\u001b[39m, in \u001b[36mcheck_consistent_length\u001b[39m\u001b[34m(*arrays)\u001b[39m\n\u001b[32m 473\u001b[39m uniques = np.unique(lengths)\n\u001b[32m 474\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(uniques) > \u001b[32m1\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m475\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 476\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 477\u001b[39m % [\u001b[38;5;28mint\u001b[39m(l) \u001b[38;5;28;01mfor\u001b[39;00m l \u001b[38;5;129;01min\u001b[39;00m lengths]\n\u001b[32m 478\u001b[39m )\n",
"\u001b[31mValueError\u001b[39m: Found input variables with inconsistent numbers of samples: [907658, 907672]"
]
}
],
"source": [
"# model_training_experiment.ipynb\n",
"\n",
"# 📦 Import libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import (\n",
" accuracy_score, precision_score, recall_score, \n",
" f1_score, roc_auc_score, confusion_matrix, \n",
" classification_report, roc_curve\n",
")\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"from xgboost import XGBClassifier\n",
"\n",
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.pipeline import Pipeline as ImbPipeline\n",
"import joblib\n",
"\n",
"# 📂 Load processed data\n",
"X_train = pd.read_csv('X_train.csv')\n",
"X_test = pd.read_csv('X_test.csv')\n",
"# Each label CSV is read as a one-column DataFrame; squeeze it into a 1-D Series so\n",
"# scikit-learn estimators do not emit DataConversionWarning for a column-vector y.\n",
"y_train = pd.read_csv('y_train.csv').squeeze('columns')\n",
"y_test = pd.read_csv('y_test.csv').squeeze('columns')\n",
"\n",
"# 🧪 Check class distribution\n",
"print(\"Class distribution in training set:\")\n",
"print(y_train.value_counts())\n",
"print(\"\\nClass distribution in test set:\")\n",
"print(y_test.value_counts())\n",
"\n",
"\n",
"def _assert_aligned(split, X, y):\n",
"    \"\"\"Raise ValueError when features and labels of one split differ in row count.\"\"\"\n",
"    if len(X) != len(y):\n",
"        raise ValueError(\n",
"            f'{split} split is misaligned: X has {len(X)} rows but y has {len(y)}; '\n",
"            'regenerate the processed CSVs so every feature row has exactly one label.'\n",
"        )\n",
"\n",
"\n",
"# Fail fast with a readable message instead of erroring deep inside model.fit.\n",
"_assert_aligned('train', X_train, y_train)\n",
"_assert_aligned('test', X_test, y_test)\n",
"\n",
"# ⚙️ Evaluation Function\n",
"def evaluate_model(model, X_train, X_test, y_train, y_test):\n",
"    \"\"\"Fit ``model`` on the training split and report its test-set performance.\n",
"\n",
"    Prints accuracy, precision, recall, F1 and ROC AUC, then draws a\n",
"    confusion-matrix heatmap and a ROC curve.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.\n",
"    X_train, X_test : feature matrices for the training and test splits.\n",
"    y_train, y_test : labels for the two splits; a single-column 2-D input\n",
"        is flattened to 1-D before use.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The same ``model`` object after ``fit`` has been called on it.\n",
"    \"\"\"\n",
"    # Labels read with pd.read_csv arrive with shape (n, 1); flatten those so\n",
"    # estimators do not emit DataConversionWarning for a column-vector y.\n",
"    # Anything that is not a single-column 2-D object is passed through untouched.\n",
"    y_train, y_test = (\n",
"        np.ravel(y) if getattr(y, 'ndim', 1) == 2 and y.shape[1] == 1 else y\n",
"        for y in (y_train, y_test)\n",
"    )\n",
"\n",
"    model.fit(X_train, y_train)\n",
"    y_pred = model.predict(X_test)\n",
"    # Column 1 of predict_proba is the score used for ROC AUC and the ROC curve.\n",
"    y_prob = model.predict_proba(X_test)[:, 1]\n",
"\n",
"    print(f\"\\n🔍 Model: {model.__class__.__name__}\")\n",
"    print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
"    print(\"Precision:\", precision_score(y_test, y_pred))\n",
"    print(\"Recall:\", recall_score(y_test, y_pred))\n",
"    print(\"F1 Score:\", f1_score(y_test, y_pred))\n",
"    print(\"ROC AUC:\", roc_auc_score(y_test, y_prob))\n",
"\n",
"    # Confusion Matrix\n",
"    cm = confusion_matrix(y_test, y_pred)\n",
"    _, cm_ax = plt.subplots()\n",
"    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)\n",
"    cm_ax.set_title('Confusion Matrix')\n",
"    cm_ax.set_xlabel('Predicted')\n",
"    cm_ax.set_ylabel('Actual')\n",
"    plt.show()\n",
"\n",
"    # ROC Curve\n",
"    fpr, tpr, _ = roc_curve(y_test, y_prob)\n",
"    _, roc_ax = plt.subplots()\n",
"    roc_ax.plot(fpr, tpr, label=\"ROC Curve\")\n",
"    roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line\n",
"    roc_ax.set_xlabel('False Positive Rate')\n",
"    roc_ax.set_ylabel('True Positive Rate')\n",
"    roc_ax.set_title('ROC Curve')\n",
"    roc_ax.legend()\n",
"    plt.show()\n",
"\n",
"    return model\n",
"\n",
"# ⚗️ Baseline candidates, all seeded with random_state=42\n",
"models = []\n",
"models.append(LogisticRegression(max_iter=1000, random_state=42))\n",
"models.append(RandomForestClassifier(random_state=42))\n",
"models.append(GradientBoostingClassifier(random_state=42))\n",
"models.append(XGBClassifier(random_state=42, eval_metric='logloss'))\n",
"\n",
"# Fit and score every baseline on the same train/test split.\n",
"print(\"📊 Evaluating Baseline Models:\")\n",
"for model in models:\n",
"    evaluate_model(\n",
"        model,\n",
"        X_train,\n",
"        X_test,\n",
"        y_train,\n",
"        y_test,\n",
"    )\n",
"\n",
"# ⚖️ SMOTE experiment: SMOTE runs as a pipeline step ahead of logistic regression\n",
"print(\"\\n📈 Experiment with SMOTE for class imbalance:\")\n",
"smote_pipeline = ImbPipeline(\n",
"    steps=[\n",
"        ('smote', SMOTE(random_state=42)),\n",
"        ('model', LogisticRegression(max_iter=1000, random_state=42)),\n",
"    ]\n",
")\n",
"evaluate_model(smote_pipeline, X_train, X_test, y_train, y_test)\n",
"\n",
"# 🔍 Hyperparameter Tuning (XGBoost)\n",
"print(\"\\n🔧 Hyperparameter tuning for XGBoost:\")\n",
"# Every key carries the 'model__' prefix, matching the 'model' step name of the pipeline below.\n",
"param_grid = dict(\n",
"    model__n_estimators=[100, 200],\n",
"    model__max_depth=[3, 5, 7],\n",
"    model__learning_rate=[0.01, 0.1],\n",
"    model__subsample=[0.8, 1.0],\n",
"    model__colsample_bytree=[0.8, 1.0],\n",
")\n",
"\n",
"# SMOTE followed by XGBoost; this pipeline is the estimator handed to the grid search.\n",
"grid_pipeline = ImbPipeline(\n",
"    steps=[\n",
"        ('smote', SMOTE(random_state=42)),\n",
"        ('model', XGBClassifier(random_state=42, eval_metric='logloss')),\n",
"    ]\n",
")\n",
"\n",
"# 3-fold stratified CV, shuffled with a fixed seed; candidates are ranked by ROC AUC.\n",
"cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)\n",
"grid_search = GridSearchCV(\n",
"    estimator=grid_pipeline,\n",
"    param_grid=param_grid,\n",
"    cv=cv,\n",
"    scoring='roc_auc',\n",
"    n_jobs=-1,\n",
"    verbose=1,\n",
")\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Best parameters:\", grid_search.best_params_)\n",
"print(\"Best ROC AUC from CV:\", grid_search.best_score_)\n",
"\n",
"# 🏆 Evaluate Best Model\n",
"# Note: evaluate_model calls fit again on the estimator it receives.\n",
"best_model = grid_search.best_estimator_\n",
"evaluate_model(\n",
"    model=best_model,\n",
"    X_train=X_train,\n",
"    X_test=X_test,\n",
"    y_train=y_train,\n",
"    y_test=y_test,\n",
")\n",
"\n",
"# 🌟 Feature Importance\n",
"# Read importances from the pipeline's 'model' step; skipped if the step has none.\n",
"model_step = best_model.named_steps['model']\n",
"if hasattr(model_step, 'feature_importances_'):\n",
"    importances = model_step.feature_importances_\n",
"    features = X_train.columns\n",
"    # Build the table and rank it in one chained expression, highest importance first.\n",
"    feature_importance = pd.DataFrame(\n",
"        {'Feature': features, 'Importance': importances}\n",
"    ).sort_values('Importance', ascending=False)\n",
"\n",
"    plt.figure(figsize=(12, 8))\n",
"    sns.barplot(data=feature_importance, x='Importance', y='Feature')\n",
"    plt.title('Feature Importance')\n",
"    plt.show()\n",
"\n",
"# 💾 Persist the best estimator found by the grid search to disk with joblib\n",
"joblib.dump(value=best_model, filename='best_fraud_detection_model.pkl')\n",
"print(\"✅ Best model saved as 'best_fraud_detection_model.pkl'\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}