Files
task_fraud_detection_bolade/experiments/model_selection.ipynb
T

300 lines
40 KiB
Plaintext
Raw Normal View History

2025-04-25 17:53:04 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "2e06e8ea",
"metadata": {},
"source": [
"### Preprocessing Pipeline and Model Selection for Fraud Detection\n",
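"This notebook builds a scaling pipeline for the selected features and compares four classifiers (Logistic Regression, Random Forest, XGBoost, LightGBM) on a held-out test split, choosing the one with the highest ROC-AUC for fraud prediction.\n"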
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bc5f3e5f",
"metadata": {},
"outputs": [],
"source": [
"# Cell 2: Imports\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.metrics import classification_report, roc_auc_score\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from xgboost import XGBClassifier\n",
"from lightgbm import LGBMClassifier"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9c62f57d",
"metadata": {},
"outputs": [],
"source": [
"# Load the resampled transactions and derive model-ready features\n",
"df = (\n",
"    pd.read_csv(\"../data/processed/resampled_data.csv\")\n",
"    # Parse the two timestamp columns so the .dt accessor is available below\n",
"    .assign(\n",
"        trans_date_trans_time=lambda frame: pd.to_datetime(frame['trans_date_trans_time']),\n",
"        dob=lambda frame: pd.to_datetime(frame['dob']),\n",
"    )\n",
"    # Calendar features from the transaction time, plus age at transaction time\n",
"    .assign(\n",
"        hour=lambda frame: frame['trans_date_trans_time'].dt.hour,\n",
"        day=lambda frame: frame['trans_date_trans_time'].dt.day,\n",
"        month=lambda frame: frame['trans_date_trans_time'].dt.month,\n",
"        weekday=lambda frame: frame['trans_date_trans_time'].dt.weekday,\n",
"        # Whole 365-day years: leap days are ignored, so age is approximate\n",
"        age=lambda frame: (frame['trans_date_trans_time'] - frame['dob']).dt.days // 365,\n",
"    )\n",
"    # Drop high-cardinality or unneeded columns\n",
"    .drop(columns=['trans_date_trans_time', 'dob', 'trans_num', 'unix_time', 'cc_num', 'street'])\n",
")\n",
"\n",
"# Replace every categorical column with integer codes from its own freshly fitted LabelEncoder\n",
"cat_cols = ['gender', 'category', 'job', 'merchant', 'first', 'last', 'city', 'state']\n",
"df[cat_cols] = df[cat_cols].apply(lambda column: LabelEncoder().fit_transform(column))\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c15ede5e",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the RFE-selected features as model inputs\n",
"selected_features = [\n",
"    'merchant', 'category', 'amt', 'last', 'city',\n",
"    'city_pop', 'merch_lat', 'merch_long', 'hour', 'age',\n",
"]\n",
"\n",
"# Feature matrix and fraud target\n",
"X = df.loc[:, selected_features]\n",
"y = df.loc[:, 'is_fraud']\n",
"\n",
"# Stratified 80/20 hold-out split with a fixed seed for repeatability\n",
"# NOTE(review): the input file is already resampled, so presumably the test split is\n",
"# balanced too - confirm these metrics are not meant to reflect the original class imbalance.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.2,\n",
"    stratify=y,\n",
"    random_state=42,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b782c387",
"metadata": {},
"outputs": [],
"source": [
"# Preprocessing pipeline: standardise every selected feature.\n",
"# Emitting DataFrames keeps column names flowing into the classifiers, which avoids\n",
"# scikit-learn's 'X does not have valid feature names' warning at predict time.\n",
"preprocessor = ColumnTransformer([\n",
"    (\"num\", StandardScaler(), selected_features)\n",
"]).set_output(transform=\"pandas\")\n",
"\n",
"# Candidate classifiers, seeded where a seed is passed so runs are repeatable.\n",
"models = {\n",
"    \"Logistic Regression\": LogisticRegression(max_iter=1000),\n",
"    \"Random Forest\": RandomForestClassifier(n_estimators=100, random_state=42),\n",
"    # use_label_encoder is not passed: XGBoost reported it as an unused parameter.\n",
"    \"XGBoost\": XGBClassifier(eval_metric='logloss', random_state=42),\n",
"    # verbose=-1 silences LightGBM's per-fit info log.\n",
"    \"LightGBM\": LGBMClassifier(random_state=42, verbose=-1)\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b9504216",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\nLogistic Regression\n",
" precision recall f1-score support\n",
"\n",
" 0 0.80 0.95 0.87 1931\n",
" 1 0.94 0.76 0.84 1930\n",
"\n",
" accuracy 0.86 3861\n",
" macro avg 0.87 0.86 0.86 3861\n",
"weighted avg 0.87 0.86 0.86 3861\n",
"\n",
"AUC-ROC: 0.8519959321997514\n",
"\\nRandom Forest\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.98 0.98 1931\n",
" 1 0.98 0.97 0.97 1930\n",
"\n",
" accuracy 0.98 3861\n",
" macro avg 0.98 0.98 0.98 3861\n",
"weighted avg 0.98 0.98 0.98 3861\n",
"\n",
"AUC-ROC: 0.9969229881695705\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\xgboost\\training.py:183: UserWarning: [12:43:21] WARNING: C:\\actions-runner\\_work\\xgboost\\xgboost\\src\\learner.cc:738: \n",
"Parameters: { \"use_label_encoder\" } are not used.\n",
"\n",
" bst.update(dtrain, iteration=i, fobj=obj)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\nXGBoost\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.98 0.98 1931\n",
" 1 0.98 0.98 0.98 1930\n",
"\n",
" accuracy 0.98 3861\n",
" macro avg 0.98 0.98 0.98 3861\n",
"weighted avg 0.98 0.98 0.98 3861\n",
"\n",
"AUC-ROC: 0.9981085802142841\n",
"[LightGBM] [Info] Number of positive: 7721, number of negative: 7720\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 1896\n",
"[LightGBM] [Info] Number of data points in the train set: 15441, number of used features: 10\n",
"[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500032 -> initscore=0.000130\n",
"[LightGBM] [Info] Start training from score 0.000130\n",
"\\nLightGBM\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.98 0.98 1931\n",
" 1 0.98 0.98 0.98 1930\n",
"\n",
" accuracy 0.98 3861\n",
" macro avg 0.98 0.98 0.98 3861\n",
"weighted avg 0.98 0.98 0.98 3861\n",
"\n",
"AUC-ROC: 0.9980592084962286\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:2739: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names\n",
" warnings.warn(\n",
"c:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:2739: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names\n",
" warnings.warn(\n"
]
}
],
"source": [
"# Cell 4: Model Training & Evaluation\n",
"# Fit every candidate behind the preprocessing step and score it on the held-out split.\n",
"results = {}\n",
"for name, model in models.items():\n",
"    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])\n",
"    pipe.fit(X_train, y_train)\n",
"    y_pred = pipe.predict(X_test)\n",
"    # Column 1 of predict_proba is the score for the second class (label 1 in the reports).\n",
"    y_prob = pipe.predict_proba(X_test)[:, 1]\n",
"    # Compute ROC-AUC once and reuse it for both the printout and the results table.\n",
"    roc_auc = roc_auc_score(y_test, y_prob)\n",
"\n",
"    # The leading newline puts a blank line before each model's report.\n",
"    print(f\"\\n{name}\")\n",
"    print(classification_report(y_test, y_pred))\n",
"    print(\"AUC-ROC:\", roc_auc)\n",
"\n",
"    results[name] = {\n",
"        \"model\": pipe,\n",
"        \"roc_auc\": roc_auc\n",
"    }\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "097ea9bf",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAArMAAAIdCAYAAAAu8fWFAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAAWetJREFUeJzt3QmcjfX7//HLkq0slZDlmyVryhppV4pCFL4iSyqlklBZUpZkLUu2hBQiSmgjSWlDyhZlCWXJnn0X9//x/vz+9/meGTPMYObM7byej8dh5j7L3HPmPudc9/W5Ptcnled5ngEAAAABlDrSOwAAAACcLYJZAAAABBbBLAAAAAKLYBYAAACBRTALAACAwCKYBQAAQGARzAIAACCwCGYBAAAQWASzAAAACCyCWeAClCpVKuvatWui7/fXX3+5+7777rtJsl8Xmvz589vDDz9sKdHtt99uJUuWjPRuAECSI5gFkogCQgWGuvzwww+nXK+VpPPly+eur1GjhgXRtm3b7Pnnn7dixYpZpkyZ7OKLL7Zy5crZq6++anv27In07iEZ+Me4f8mSJYvddttt9vnnn8d7n99++80aNWpkefLksfTp01vu3LntoYcectvjs3btWnviiSesYMGCliFDBvdzbrrpJnvjjTfs8OHDCd7fYcOGuf2sWLHiaU/oXn/99Tiv13Zdr9vFNnXqVLvnnnsse/bsli5dOvd7/fe//7Wvv/76jPt14MAB69KlizsB0evo8ssvt9KlS9uzzz5rmzdvTvDvB0SjtJHeAeBCpw/eCRMm2M033xxj+7fffmubNm1yH+ZB9PPPP9u9997rPoQVmCiIlV9++cV69+5t3333nX355Zd2IVu1apWlTk1O4K677rImTZq4E7T169fbm2++aTVr1rQZM2ZY1apVY9x2ypQp1qBBA7vsssvs0UcftQIFCrjA8O2337bJkyfbxIkT7f77749xHwXG9erVc68V/RwFfMeOHXMniS+88IILgkeMGJGgfR0/frzLqC9YsMDWrFljV1999Tn//vq9H3nkEXcCW6ZMGWvbtq3lypXLtmzZ4gLcO++803788Ue78cYb47z/8ePH7dZbb7WVK1da06ZN7ZlnnnGvK/1eeu/Q86HAGEA8PABJ4p133vH0EnvggQe87Nmze8ePH49xffPmzb1y5cp5V111lVe9evXz+rP1c7t06ZLo+/3555/uvtr309m9e7eXJ08eL2fOnN6KFStOuX7r1q1e9+7dvQvRyZMnvUOHDnkp3W233eZdc801Sf5zdLw8/fTTMbb9/vvvbvs999wTY/uaNWu8TJkyecWKFfO2b98e47odO3a47RdffLG3du3a0PZ169Z5l1xyibtu8+bNp/z8P/74wxs4cGCC9lWPpf2aMmWKd8UVV3hdu3aN9zXw2muvxfkY2q7rdbvY21q3bu2Oj9jGjh3r/fTTT/Hu1wcffODuP378+FOuO3z4sLd3714vuRw4cCDZfhZwvpBSAJKYslD//POPzZo1K7RNWSVloRo2bBjnfQ4ePGjPPfecK0NQNqpo0aJuePP/Yof/OXr0qLVp08auuOIKy5w5s913330u2xuXv//+22WPcubM6R7zmmuusdGjR5/V7/TWW2+5x+vfv78rMYhNP+Oll146ZXhXP9MfVn766adPKUXw6zx//fVXN1St0gVlzvRc+dlsDQ9nzJjRPSdfffVVjPurTlhDwMpwaXhXQ9EartVQ7ZEjR2Lc9p133rE77rjDcuTI4fapRIkSLqMYm7J4KgOZOXOmlS9f3v1s/f5x1cwqw9atWzcrXLiwy8jrZysjH/63Fw0733LLLW44OVu2bFarVi1bsWJFnL+Lsof6Gbpd1qxZrVmzZnbo0CFLqIULF7qMoPZbWdDhw4eHrlP2T/ug5yc2HUdp0qSxXr16WWIVL17cDbWrNCDca6+95vZdWVQds+F0ez2vOvb79u0b2q6vtZ/K3F555Z
Wn/CwdH3Htf3xZ2UsvvdSqV69udevWdd+fK5U46DnS68AvQYitcePGVqFChXgfw3+eVDYRm19SEc4/vvUc+q+FTp06xbjN4sWLXcmD7nvJJZe47PD8+fPjLIXS6+qpp55yr4W8efOGrldm3T9O9f6i5+10pSBApBDMAklMAU+lSpXs/fffj/EhsXfvXnvwwQdPub0CVgWlAwYMsGrVqrmAUR9WGk7V8GW4xx57zAYOHGh33323G9q/6KKL3AdOXLWtN9xwgwv+WrZs6eoMFQRomFf3T6xPPvnEfYgqIEgIBWYKXhXE9uvXz+rUqeMCF+23AsBwu3fvdsGjglYFMgo09TxNmjTJ/a/SBv2uCnr08/fv33/Kz9MHvYJXBRm6/aBBg+zxxx+PcRsFrldddZW9+OKLbp904qAP9KFDh8ZZTqCTEg2n67lTLWN8v6eC2cqVK9uQIUNcgPGf//zHFi1aFLqN/gYaet++fbu7vf6mc+fOdYFMXHWY+l30O+p30dcKQPQzEkLPpX5/lYDouVSg8uSTT4ZOYhTkaAhbz+2JEydi3FfHq45F1bImlo5t/WwFjuE+/fRT93pQgBQXDbXr+vB6W91HdbLxDdEnhoLXBx54wNWz6u/5xx9/uHKZc6FSh127drkTUwX/Z0PHoYwdO/aUE9bYdKKn14ZOiJo3b+6Ox9q1a7vnyaeAU8/x0qVLrV27dvbyyy/bn3/+6U4Wf/rpp1MeU8f977//bp07d7YOHTq4bePGjXPvJTpG+vTp4x5Dt9HJWVzHKRBR5y3HCyDOMoOff/7ZGzJkiJc5c+bQ8HS9evW8ypUru69jlxlMmzbN3e/VV1+N8Xh169b1UqVK5YZqZcmSJe52Tz31VIzbNWzY8JQyg0cffdS78sorvZ07d8a47YMPPuhlzZo1tF8JLTO49NJLvVKlSiXoedBwcrp06by7777bO3HiRGi7nhP9rNGjR8cYGte2CRMmhLatXLnSbUudOrU3f/780PaZM2eesq/6nbXtvvvui7EPeo60fenSpaFtcZUKVK1a1StYsGCMbfr76L5ffPHFKbfXdU2bNg19r+fkTCUjpUuX9nLkyOH9888/oW3aL/1+TZo0OeV3eeSRR2Lc//777/cuv/xy70z857Jfv36hbUePHg39/GPHjsV4HmfMmBHj/tddd517jDPRfXV8qUxAf+tffvnFq1at2ilD9Xv27HHbatWqddrH099Ot9u3b58bXk/IfRJC+6XHmjVrlvte5QB58+b1nn322XMqM3jjjTfc91OnTj3rfdOxWLRoUfc4OqYefvhh7+233/a2bdt2ym1vvfVW916yfv36GNvDyxtq167tXnPh5Roq0dD9dP/Y71E333yz9++//4a279+/38uWLZsrhYpdPqT3i9jbgUgjMwskA2XUNBz52WefuSyb/o+vxGD69Okuw9OqVasY21V2oNhBWV3/dhL7dq1bt47xve7z0UcfuQk5+nrnzp2hizKEyqKFZw4TYt++fW7YMSGUiVRZhfYrfLKUskoaAo09612ZoPCMtbLSGmLX0HX4DHT/63Xr1p3yM5UFDqcJNeHPmSiz7NNzoOdDpQ16PH0fTsPzsScyxUX7qayYMn5x0YSgJUuWuLIBTYDyXXfddS7rG75/vhYtWsT4Xhk3la3ob3AmadOmdR0AfMpI6ntlhVV+IFWqVHEZ8/Ah9+XLl7sMoCb2JYRKADTkrWFqlWLMnj3bZQTDRxL8DPqZjhv/ev1+/u+Y0GPtdPT7qfxFWXPR8Hr9+vXdhLPYWenEOB/7qGNRGVONvoiy7xo1UVmFjl2VE8mOHTvcxEqVCynjH84vb9DvoomXytYqo+3TY+k9R5nk2MeOXovhWWWVxagESNnr8PcL3Uavu2+++easf1cgKRDMAslAH/QKGjQzWbO59YET3xC9ZoMruIj94ahgzr/e/1/BYaFChWLcTsFfOH0A6oPJr1MMv6j+UhTcJIaC0LiG9+P7feLaLw
VW+rD1r/dpKDx23aFqRVUGEHubaDg7NtWshtNzpOcqfHhUs8v1N/HrVvV8qORA4gpmE+KVV15xz3WRIkXs2muvdcG
"text/plain": [
"<Figure size 800x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Cell 5: Compare ROC-AUC\n",
"# Pull each model's stored test ROC-AUC into a name -> score mapping\n",
"scores = {model_name: entry[\"roc_auc\"] for model_name, entry in results.items()}\n",
"\n",
"# One bar per model, drawn on an explicit Axes\n",
"fig, ax = plt.subplots(figsize=(8, 5))\n",
"sns.barplot(x=list(scores), y=list(scores.values()), ax=ax)\n",
"ax.set_title(\"Model Comparison by ROC-AUC Score\")\n",
"ax.set_ylabel(\"ROC-AUC\")\n",
"plt.setp(ax.get_xticklabels(), rotation=45)\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "830073d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best model based on AUC-ROC: XGBoost\n"
]
}
],
"source": [
"# Cell 6: Best Model Summary\n",
"# Pick the entry with the highest ROC-AUC (the first one wins on a tie)\n",
"best_model_name = max(scores.items(), key=lambda pair: pair[1])[0]\n",
"best_model = results[best_model_name][\"model\"]\n",
"print(f\"Best model based on AUC-ROC: {best_model_name}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb18adc2",
"metadata": {},
"outputs": [],
"source": [
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}