First commit

Defined file structure and completed EDA
This commit is contained in:
boladeE
2025-04-24 23:39:36 +01:00
commit 50e95445fb
21 changed files with 1514 additions and 0 deletions
+159
View File
@@ -0,0 +1,159 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2c5baf8e",
"metadata": {},
"source": [
"# 📊 Exploratory Data Analysis: Fraud Detection Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f3e6a97",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"df = pd.read_csv(\"fraudTest.csv\")\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "2bcadae6",
"metadata": {},
"source": [
"## 🧾 Basic Overview of the Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "820cb0e9",
"metadata": {},
"outputs": [],
"source": [
"print(\"Shape:\", df.shape)\n",
"print(\"\\nData Types:\\n\", df.dtypes)\n",
"print(\"\\nMissing Values:\\n\", df.isnull().sum())\n",
"print(\"\\nDuplicate Rows:\", df.duplicated().sum())"
]
},
{
"cell_type": "markdown",
"id": "caa22db9",
"metadata": {},
"source": [
"## ⚖️ Class Balance"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fb75259",
"metadata": {},
"outputs": [],
"source": [
"sns.countplot(data=df, x=\"is_fraud\")\n",
"plt.title(\"Fraud vs Non-Fraud Transactions\")\n",
"plt.show()\n",
"\n",
"fraud_ratio = df[\"is_fraud\"].mean()\n",
"print(f\"Fraudulent transactions: {fraud_ratio:.4%}\")"
]
},
{
"cell_type": "markdown",
"id": "658e9cd2",
"metadata": {},
"source": [
"## 📊 Statistical Summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "202e2612",
"metadata": {},
"outputs": [],
"source": [
"df.describe(include='all')"
]
},
{
"cell_type": "markdown",
"id": "12d24a95",
"metadata": {},
"source": [
"## 🔗 Correlation Matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c02acf0",
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(12, 8))\n",
"sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=\".2f\", cmap=\"coolwarm\")\n",
"plt.title(\"Feature Correlation Matrix\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "fce8183a",
"metadata": {},
"source": [
"## 💵 Transaction Amount Distribution by Fraud"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea72b131",
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(data=df, x='is_fraud', y='amt')\n",
"plt.yscale('log')\n",
"plt.title(\"Transaction Amount by Fraud Status\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "a7d7d378",
"metadata": {},
"source": [
"## 🕒 Transaction Timing (Hourly)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f26f36f",
"metadata": {},
"outputs": [],
"source": [
"df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])\n",
"df['hour'] = df['trans_date_trans_time'].dt.hour\n",
"\n",
"plt.figure(figsize=(12, 6))\n",
"sns.histplot(data=df, x='hour', hue='is_fraud', multiple='stack', bins=24)\n",
"plt.title(\"Transaction Hour Distribution\")\n",
"plt.show()"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
+156
View File
@@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# feature_engineering_experiments.ipynb\n",
"\n",
"# Import libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
"from sklearn.model_selection import train_test_split\n",
"from datetime import datetime\n",
"\n",
"# Load data\n",
"df = pd.read_csv('../data/raw/fraudTrain.csv')\n",
"\n",
"# Basic preprocessing\n",
"df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])\n",
"df['dob'] = pd.to_datetime(df['dob'])\n",
"\n",
"# Experiment 1: Basic Features\n",
"def create_basic_features(df):\n",
" # Time-based features\n",
" df['hour'] = df['trans_date_trans_time'].dt.hour\n",
" df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek\n",
" df['month'] = df['trans_date_trans_time'].dt.month\n",
" \n",
" # Age feature\n",
" df['dob'] = pd.to_datetime(df['dob'])\n",
" reference_date = pd.to_datetime('2020-06-21')\n",
" df['age'] = (reference_date - df['dob']).dt.days // 365\n",
" \n",
" # Distance between merchant and customer\n",
" df['distance'] = np.sqrt((df['merch_lat'] - df['lat'])**2 + (df['merch_long'] - df['long'])**2)\n",
" \n",
" # Categorical encoding\n",
" cat_cols = ['category', 'gender', 'state']\n",
" for col in cat_cols:\n",
" le = LabelEncoder()\n",
" df[col+'_encoded'] = le.fit_transform(df[col])\n",
" \n",
" return df\n",
"\n",
"# Experiment 2: Transaction Patterns\n",
"def create_transaction_patterns(df):\n",
" # Transaction frequency per customer\n",
" trans_count = df.groupby('cc_num')['trans_num'].count().reset_index()\n",
" trans_count.columns = ['cc_num', 'trans_count']\n",
" df = df.merge(trans_count, on='cc_num', how='left')\n",
" \n",
" # Average transaction amount per customer\n",
" avg_amount = df.groupby('cc_num')['amt'].mean().reset_index()\n",
" avg_amount.columns = ['cc_num', 'avg_trans_amount']\n",
" df = df.merge(avg_amount, on='cc_num', how='left')\n",
" \n",
" # Difference from average amount\n",
" df['amt_diff_from_avg'] = df['amt'] - df['avg_trans_amount']\n",
" \n",
" return df\n",
"\n",
"# Experiment 3: Time-based Features\n",
"def create_time_features(df):\n",
" # Time since last transaction\n",
" df = df.sort_values(['cc_num', 'trans_date_trans_time'])\n",
" df['time_since_last'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 60\n",
" \n",
" # Fill NA for first transactions\n",
" df['time_since_last'] = df['time_since_last'].fillna(24*60) # Assume 24 hours if first transaction\n",
" \n",
" # Transaction velocity (transactions per hour)\n",
" df['trans_velocity'] = 60 / df['time_since_last'] # transactions per hour\n",
" \n",
" return df\n",
"\n",
"# Experiment 4: Merchant Behavior\n",
"def create_merchant_features(df):\n",
" # Merchant transaction count\n",
" merchant_counts = df['merchant'].value_counts().reset_index()\n",
" merchant_counts.columns = ['merchant', 'merchant_trans_count']\n",
" df = df.merge(merchant_counts, on='merchant', how='left')\n",
" \n",
" # Merchant fraud rate\n",
" merchant_fraud = df.groupby('merchant')['is_fraud'].mean().reset_index()\n",
" merchant_fraud.columns = ['merchant', 'merchant_fraud_rate']\n",
" df = df.merge(merchant_fraud, on='merchant', how='left')\n",
" \n",
" return df\n",
"\n",
"# Apply all feature engineering steps\n",
"df_features = create_basic_features(df)\n",
"df_features = create_transaction_patterns(df_features)\n",
"df_features = create_time_features(df_features)\n",
"df_features = create_merchant_features(df_features)\n",
"\n",
"# Select final features\n",
"features = ['amt', 'hour', 'day_of_week', 'month', 'age', 'distance',\n",
" 'category_encoded', 'gender_encoded', 'state_encoded',\n",
" 'trans_count', 'avg_trans_amount', 'amt_diff_from_avg',\n",
" 'time_since_last', 'trans_velocity', 'merchant_trans_count',\n",
" 'merchant_fraud_rate', 'city_pop']\n",
"\n",
"X = df_features[features]\n",
"y = df_features['is_fraud']\n",
"\n",
"# Split data\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)\n",
"\n",
"X_train.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
"X_test.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
"X_train.dropna(inplace=True)\n",
"# Scale numerical features\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)\n",
"X_test_scaled = scaler.transform(X_test)\n",
"\n",
"# Save processed data for modeling\n",
"pd.DataFrame(X_train_scaled, columns=features).to_csv('X_train.csv', index=False)\n",
"pd.DataFrame(X_test_scaled, columns=features).to_csv('X_test.csv', index=False)\n",
"y_train.to_csv('y_train.csv', index=False)\n",
"y_test.to_csv('y_test.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
+215
View File
@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class distribution in training set:\n",
"is_fraud\n",
"0 902418\n",
"1 5254\n",
"Name: count, dtype: int64\n",
"\n",
"Class distribution in test set:\n",
"is_fraud\n",
"0 386751\n",
"1 2252\n",
"Name: count, dtype: int64\n",
"📊 Evaluating Baseline Models:\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1408: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n"
]
},
{
"ename": "ValueError",
"evalue": "Found input variables with inconsistent numbers of samples: [907658, 907672]",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 80\u001b[39m\n\u001b[32m 78\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m📊 Evaluating Baseline Models:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 79\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m model \u001b[38;5;129;01min\u001b[39;00m models:\n\u001b[32m---> \u001b[39m\u001b[32m80\u001b[39m \u001b[43mevaluate_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_test\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 82\u001b[39m \u001b[38;5;66;03m# ⚖️ SMOTE Experiment\u001b[39;00m\n\u001b[32m 83\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m📈 Experiment with SMOTE for class imbalance:\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mevaluate_model\u001b[39m\u001b[34m(model, X_train, X_test, y_train, y_test)\u001b[39m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mevaluate_model\u001b[39m(model, X_train, X_test, y_train, y_test):\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 40\u001b[39m y_pred = model.predict(X_test)\n\u001b[32m 41\u001b[39m y_prob = model.predict_proba(X_test)[:, \u001b[32m1\u001b[39m]\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\base.py:1389\u001b[39m, in \u001b[36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m 1382\u001b[39m estimator._validate_params()\n\u001b[32m 1384\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 1385\u001b[39m skip_parameter_validation=(\n\u001b[32m 1386\u001b[39m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 1387\u001b[39m )\n\u001b[32m 1388\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1222\u001b[39m, in \u001b[36mLogisticRegression.fit\u001b[39m\u001b[34m(self, X, y, sample_weight)\u001b[39m\n\u001b[32m 1219\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1220\u001b[39m _dtype = [np.float64, np.float32]\n\u001b[32m-> \u001b[39m\u001b[32m1222\u001b[39m X, y = \u001b[43mvalidate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1223\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 1224\u001b[39m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1225\u001b[39m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1226\u001b[39m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcsr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 1227\u001b[39m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43m_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1228\u001b[39m \u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mC\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 1229\u001b[39m \u001b[43m \u001b[49m\u001b[43maccept_large_sparse\u001b[49m\u001b[43m=\u001b[49m\u001b[43msolver\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mliblinear\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msag\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msaga\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1230\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1231\u001b[39m check_classification_targets(y)\n\u001b[32m 1232\u001b[39m \u001b[38;5;28mself\u001b[39m.classes_ = np.unique(y)\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:2961\u001b[39m, in \u001b[36mvalidate_data\u001b[39m\u001b[34m(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)\u001b[39m\n\u001b[32m 2959\u001b[39m y = check_array(y, input_name=\u001b[33m\"\u001b[39m\u001b[33my\u001b[39m\u001b[33m\"\u001b[39m, **check_y_params)\n\u001b[32m 2960\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m2961\u001b[39m X, y = \u001b[43mcheck_X_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2962\u001b[39m out = X, y\n\u001b[32m 2964\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m check_params.get(\u001b[33m\"\u001b[39m\u001b[33mensure_2d\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m):\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:1389\u001b[39m, in \u001b[36mcheck_X_y\u001b[39m\u001b[34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[39m\n\u001b[32m 1370\u001b[39m X = check_array(\n\u001b[32m 1371\u001b[39m X,\n\u001b[32m 1372\u001b[39m accept_sparse=accept_sparse,\n\u001b[32m (...)\u001b[39m\u001b[32m 1384\u001b[39m input_name=\u001b[33m\"\u001b[39m\u001b[33mX\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 1385\u001b[39m )\n\u001b[32m 1387\u001b[39m y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m \u001b[43mcheck_consistent_length\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1391\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m X, y\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\babaw\\Documents\\Work\\Mana Knight Digital\\task_fraud_detection\\.venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:475\u001b[39m, in \u001b[36mcheck_consistent_length\u001b[39m\u001b[34m(*arrays)\u001b[39m\n\u001b[32m 473\u001b[39m uniques = np.unique(lengths)\n\u001b[32m 474\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(uniques) > \u001b[32m1\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m475\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 476\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 477\u001b[39m % [\u001b[38;5;28mint\u001b[39m(l) \u001b[38;5;28;01mfor\u001b[39;00m l \u001b[38;5;129;01min\u001b[39;00m lengths]\n\u001b[32m 478\u001b[39m )\n",
"\u001b[31mValueError\u001b[39m: Found input variables with inconsistent numbers of samples: [907658, 907672]"
]
}
],
"source": [
"# model_training_experiment.ipynb\n",
"\n",
"# 📦 Import libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import (\n",
" accuracy_score, precision_score, recall_score, \n",
" f1_score, roc_auc_score, confusion_matrix, \n",
" classification_report, roc_curve\n",
")\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"from xgboost import XGBClassifier\n",
"\n",
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.pipeline import Pipeline as ImbPipeline\n",
"import joblib\n",
"\n",
"# 📂 Load processed data\n",
"X_train = pd.read_csv('X_train.csv')\n",
"X_test = pd.read_csv('X_test.csv')\n",
"y_train = pd.read_csv('y_train.csv')\n",
"y_test = pd.read_csv('y_test.csv')\n",
"\n",
"# 🧪 Check class distribution\n",
"print(\"Class distribution in training set:\")\n",
"print(y_train.value_counts())\n",
"print(\"\\nClass distribution in test set:\")\n",
"print(y_test.value_counts())\n",
"\n",
"# ⚙️ Evaluation Function\n",
"def evaluate_model(model, X_train, X_test, y_train, y_test):\n",
" model.fit(X_train, y_train)\n",
" y_pred = model.predict(X_test)\n",
" y_prob = model.predict_proba(X_test)[:, 1]\n",
"\n",
" print(f\"\\n🔍 Model: {model.__class__.__name__}\")\n",
" print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
" print(\"Precision:\", precision_score(y_test, y_pred))\n",
" print(\"Recall:\", recall_score(y_test, y_pred))\n",
" print(\"F1 Score:\", f1_score(y_test, y_pred))\n",
" print(\"ROC AUC:\", roc_auc_score(y_test, y_prob))\n",
"\n",
" # Confusion Matrix\n",
" cm = confusion_matrix(y_test, y_pred)\n",
" sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')\n",
" plt.title('Confusion Matrix')\n",
" plt.xlabel('Predicted')\n",
" plt.ylabel('Actual')\n",
" plt.show()\n",
"\n",
" # ROC Curve\n",
" fpr, tpr, _ = roc_curve(y_test, y_prob)\n",
" plt.plot(fpr, tpr, label=\"ROC Curve\")\n",
" plt.plot([0, 1], [0, 1], 'k--')\n",
" plt.xlabel('False Positive Rate')\n",
" plt.ylabel('True Positive Rate')\n",
" plt.title('ROC Curve')\n",
" plt.legend()\n",
" plt.show()\n",
" \n",
" return model\n",
"\n",
"# ⚗️ Baseline Models\n",
"models = [\n",
" LogisticRegression(max_iter=1000, random_state=42),\n",
" RandomForestClassifier(random_state=42),\n",
" GradientBoostingClassifier(random_state=42),\n",
" XGBClassifier(random_state=42, eval_metric='logloss')\n",
"]\n",
"\n",
"print(\"📊 Evaluating Baseline Models:\")\n",
"for model in models:\n",
" evaluate_model(model, X_train, X_test, y_train, y_test)\n",
"\n",
"# ⚖️ SMOTE Experiment\n",
"print(\"\\n📈 Experiment with SMOTE for class imbalance:\")\n",
"smote_pipeline = ImbPipeline([\n",
" ('smote', SMOTE(random_state=42)),\n",
" ('model', LogisticRegression(max_iter=1000, random_state=42))\n",
"])\n",
"evaluate_model(smote_pipeline, X_train, X_test, y_train, y_test)\n",
"\n",
"# 🔍 Hyperparameter Tuning (XGBoost)\n",
"print(\"\\n🔧 Hyperparameter tuning for XGBoost:\")\n",
"param_grid = {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__max_depth': [3, 5, 7],\n",
" 'model__learning_rate': [0.01, 0.1],\n",
" 'model__subsample': [0.8, 1.0],\n",
" 'model__colsample_bytree': [0.8, 1.0]\n",
"}\n",
"\n",
"grid_pipeline = ImbPipeline([\n",
" ('smote', SMOTE(random_state=42)),\n",
" ('model', XGBClassifier(random_state=42, eval_metric='logloss'))\n",
"])\n",
"\n",
"cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)\n",
"grid_search = GridSearchCV(grid_pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Best parameters:\", grid_search.best_params_)\n",
"print(\"Best ROC AUC from CV:\", grid_search.best_score_)\n",
"\n",
"# 🏆 Evaluate Best Model\n",
"best_model = grid_search.best_estimator_\n",
"evaluate_model(best_model, X_train, X_test, y_train, y_test)\n",
"\n",
"# 🌟 Feature Importance\n",
"model_step = best_model.named_steps['model']\n",
"if hasattr(model_step, 'feature_importances_'):\n",
" importances = model_step.feature_importances_\n",
" features = X_train.columns\n",
" feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})\n",
" feature_importance = feature_importance.sort_values('Importance', ascending=False)\n",
"\n",
" plt.figure(figsize=(12, 8))\n",
" sns.barplot(x='Importance', y='Feature', data=feature_importance)\n",
" plt.title('Feature Importance')\n",
" plt.show()\n",
"\n",
"# 💾 Save Best Model\n",
"joblib.dump(best_model, 'best_fraud_detection_model.pkl')\n",
"print(\"✅ Best model saved as 'best_fraud_detection_model.pkl'\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}