{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fraud detection: model training experiment\n",
    "\n",
    "Trains baseline classifiers on the processed fraud data, tries SMOTE oversampling for the class imbalance, tunes an XGBoost pipeline with a grid search, and saves the best model.\n",
    "\n",
    "- **Inputs:** `X_train.csv`, `X_test.csv`, `y_train.csv`, `y_test.csv` inside `DATA_DIR`.\n",
    "- **Output:** the tuned pipeline, written to `MODEL_PATH`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import joblib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline as ImbPipeline\n",
    "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import (\n",
    "    accuracy_score,\n",
    "    classification_report,\n",
    "    confusion_matrix,\n",
    "    f1_score,\n",
    "    precision_score,\n",
    "    recall_score,\n",
    "    roc_auc_score,\n",
    "    roc_curve,\n",
    ")\n",
    "from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from xgboost import XGBClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "DATA_DIR = Path('.')  # directory holding the processed train/test CSV files\n",
    "SEED = 42  # shared random seed so repeated runs are comparable\n",
    "MODEL_PATH = Path('best_fraud_detection_model.pkl')  # where the tuned pipeline is saved"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load processed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = pd.read_csv(DATA_DIR / 'X_train.csv')\n",
    "X_test = pd.read_csv(DATA_DIR / 'X_test.csv')\n",
    "\n",
    "# The target files hold a single column. Squeeze them to 1-D Series so that\n",
    "# scikit-learn does not emit DataConversionWarning for a column-vector y.\n",
    "y_train = pd.read_csv(DATA_DIR / 'y_train.csv').squeeze('columns')\n",
    "y_test = pd.read_csv(DATA_DIR / 'y_test.csv').squeeze('columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_split(name, X, y):\n",
    "    \"\"\"Raise ValueError when the features and target of one split disagree.\n",
    "\n",
    "    Truncating or re-indexing a mismatched split would silently pair rows\n",
    "    with the wrong labels, so the only safe reaction is to stop and ask for\n",
    "    both files to be regenerated together.\n",
    "    \"\"\"\n",
    "    if y.ndim != 1:\n",
    "        raise ValueError(\n",
    "            f'y_{name}.csv must contain exactly one target column, '\n",
    "            f'found columns {list(y.columns)}.'\n",
    "        )\n",
    "    if len(X) != len(y):\n",
    "        raise ValueError(\n",
    "            f'X_{name}.csv has {len(X)} rows but y_{name}.csv has {len(y)}; '\n",
    "            'regenerate both files from the same preprocessing run.'\n",
    "        )\n",
    "\n",
    "\n",
    "# Fail fast here instead of deep inside model.fit().\n",
    "check_split('train', X_train, y_train)\n",
    "check_split('test', X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Class distribution in training set:\")\n",
    "print(y_train.value_counts())\n",
    "print(\"\\nClass distribution in test set:\")\n",
    "print(y_test.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation helper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_model(model, X_train, X_test, y_train, y_test, fit=True):\n",
    "    \"\"\"Fit a binary classifier and report its performance on the test split.\n",
    "\n",
    "    Prints accuracy, precision, recall, F1 and ROC AUC, then draws the\n",
    "    confusion matrix and the ROC curve.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model : estimator\n",
    "        Object exposing ``fit``, ``predict`` and ``predict_proba``.\n",
    "    X_train, y_train\n",
    "        Training features and labels passed to ``model.fit``.\n",
    "    X_test, y_test\n",
    "        Hold-out features and labels used for every reported metric.\n",
    "    fit : bool, default True\n",
    "        Pass False for an estimator that is already fitted (for example\n",
    "        ``GridSearchCV.best_estimator_``) to skip a redundant retraining.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    estimator\n",
    "        The same ``model`` object that was passed in.\n",
    "    \"\"\"\n",
    "    if fit:\n",
    "        model.fit(X_train, y_train)\n",
    "    y_pred = model.predict(X_test)\n",
    "    y_prob = model.predict_proba(X_test)[:, 1]  # second column of predict_proba\n",
    "\n",
    "    print(f\"\\n🔍 Model: {model.__class__.__name__}\")\n",
    "    print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
    "    # zero_division=0: a model that never predicts the positive class scores 0\n",
    "    # without emitting UndefinedMetricWarning on this heavily imbalanced data.\n",
    "    print(\"Precision:\", precision_score(y_test, y_pred, zero_division=0))\n",
    "    print(\"Recall:\", recall_score(y_test, y_pred, zero_division=0))\n",
    "    print(\"F1 Score:\", f1_score(y_test, y_pred, zero_division=0))\n",
    "    print(\"ROC AUC:\", roc_auc_score(y_test, y_prob))\n",
    "\n",
    "    # Confusion matrix\n",
    "    fig, ax = plt.subplots()\n",
    "    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)\n",
    "    ax.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')\n",
    "    plt.show()\n",
    "\n",
    "    # ROC curve, with the diagonal as the random-guess reference\n",
    "    fpr, tpr, _ = roc_curve(y_test, y_prob)\n",
    "    fig, ax = plt.subplots()\n",
    "    ax.plot(fpr, tpr, label='ROC Curve')\n",
    "    ax.plot([0, 1], [0, 1], 'k--')\n",
    "    ax.set(title='ROC Curve', xlabel='False Positive Rate', ylabel='True Positive Rate')\n",
    "    ax.legend()\n",
    "    plt.show()\n",
    "\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Baseline models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "models = [\n",
    "    LogisticRegression(max_iter=1000, random_state=SEED),\n",
    "    RandomForestClassifier(random_state=SEED),\n",
    "    GradientBoostingClassifier(random_state=SEED),\n",
    "    XGBClassifier(random_state=SEED, eval_metric='logloss'),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"📊 Evaluating Baseline Models:\")\n",
    "for model in models:\n",
    "    evaluate_model(model, X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SMOTE experiment\n",
    "\n",
    "SMOTE sits inside an imbalanced-learn pipeline so that oversampling happens only when the pipeline is fitted, never when it predicts on the test split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n📈 Experiment with SMOTE for class imbalance:\")\n",
    "smote_pipeline = ImbPipeline([\n",
    "    ('smote', SMOTE(random_state=SEED)),\n",
    "    ('model', LogisticRegression(max_iter=1000, random_state=SEED)),\n",
    "])\n",
    "evaluate_model(smote_pipeline, X_train, X_test, y_train, y_test);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hyperparameter tuning (XGBoost)\n",
    "\n",
    "The grid has 48 parameter combinations and is evaluated with 3-fold stratified cross-validation, so this cell performs 144 pipeline fits plus the final refit."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n🔧 Hyperparameter tuning for XGBoost:\")\n",
    "param_grid = {\n",
    "    'model__n_estimators': [100, 200],\n",
    "    'model__max_depth': [3, 5, 7],\n",
    "    'model__learning_rate': [0.01, 0.1],\n",
    "    'model__subsample': [0.8, 1.0],\n",
    "    'model__colsample_bytree': [0.8, 1.0],\n",
    "}\n",
    "\n",
    "grid_pipeline = ImbPipeline([\n",
    "    ('smote', SMOTE(random_state=SEED)),\n",
    "    ('model', XGBClassifier(random_state=SEED, eval_metric='logloss')),\n",
    "])\n",
    "\n",
    "cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)\n",
    "grid_search = GridSearchCV(grid_pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)\n",
    "grid_search.fit(X_train, y_train)\n",
    "\n",
    "print(\"Best parameters:\", grid_search.best_params_)\n",
    "print(\"Best ROC AUC from CV:\", grid_search.best_score_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate best model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GridSearchCV (refit=True by default) has already refit best_estimator_ on the\n",
    "# full training set, so evaluate it without training it a second time.\n",
    "best_model = grid_search.best_estimator_\n",
    "evaluate_model(best_model, X_train, X_test, y_train, y_test, fit=False);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_step = best_model.named_steps['model']\n",
    "if hasattr(model_step, 'feature_importances_'):\n",
    "    feature_importance = (\n",
    "        pd.DataFrame({'Feature': X_train.columns, 'Importance': model_step.feature_importances_})\n",
    "        .sort_values('Importance', ascending=False)\n",
    "    )\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(12, 8))\n",
    "    sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)\n",
    "    ax.set_title('Feature Importance')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save best model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "joblib.dump(best_model, MODEL_PATH)\n",
    "print(f\"✅ Best model saved as '{MODEL_PATH}'\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}