Files

711 lines
123 KiB
Plaintext
Raw Permalink Normal View History

2025-04-25 17:53:04 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "7f073598",
"metadata": {},
"source": [
"# Feature Engineering and Feature Selection for Fraud Detection"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1816729a",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
"from sklearn.feature_selection import VarianceThreshold, RFE\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.under_sampling import RandomUnderSampler"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5309a15c",
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv('../data/raw/fraudTrain.csv')\n",
"df_test = pd.read_csv('../data/raw/fraudTest.csv')\n",
"\n",
"df = pd.concat([df_train, df_test])\n",
"X = df.drop(['is_fraud'], axis=1)\n",
"y = df['is_fraud']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd2b4e07",
"metadata": {},
"outputs": [],
"source": [
"rus = RandomUnderSampler(sampling_strategy=1)\n",
"X_res, y_res = rus.fit_resample(X, y)\n",
"\n",
"# Combine X_res and y_res into a single DataFrame\n",
"resampled_df = pd.concat([X_res, y_res], axis=1)\n",
"\n",
"# Save the combined DataFrame to a new file\n",
"resampled_df.to_csv('../data/processed/resampled_data.csv', index=False)\n",
"\n",
"ax = y_res.value_counts().plot.pie(autopct='%.2f')\n",
"_ = ax.set_title(\"Under-sampling\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "890be8a6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>trans_date_trans_time</th>\n",
" <th>cc_num</th>\n",
" <th>merchant</th>\n",
" <th>category</th>\n",
" <th>amt</th>\n",
" <th>first</th>\n",
" <th>last</th>\n",
" <th>gender</th>\n",
" <th>street</th>\n",
" <th>city</th>\n",
" <th>...</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>city_pop</th>\n",
" <th>job</th>\n",
" <th>dob</th>\n",
" <th>trans_num</th>\n",
" <th>unix_time</th>\n",
" <th>merch_lat</th>\n",
" <th>merch_long</th>\n",
" <th>is_fraud</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-12-15 18:33:17</td>\n",
" <td>30551643947183</td>\n",
" <td>fraud_Dickinson Ltd</td>\n",
" <td>personal_care</td>\n",
" <td>6.15</td>\n",
" <td>Morgan</td>\n",
" <td>Smith</td>\n",
" <td>F</td>\n",
" <td>1441 Bradley Place</td>\n",
" <td>Grover</td>\n",
" <td>...</td>\n",
" <td>35.1836</td>\n",
" <td>-81.4552</td>\n",
" <td>5621</td>\n",
" <td>Toxicologist</td>\n",
" <td>1973-11-14</td>\n",
" <td>7cab35c172dc6c78f551e981fe426fc6</td>\n",
" <td>1387132397</td>\n",
" <td>35.292860</td>\n",
" <td>-81.937193</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2019-12-23 08:14:17</td>\n",
" <td>5359543825610251</td>\n",
" <td>fraud_Kilback LLC</td>\n",
" <td>grocery_pos</td>\n",
" <td>67.35</td>\n",
" <td>Michael</td>\n",
" <td>Francis</td>\n",
" <td>M</td>\n",
" <td>1833 Jeanette Stravenue</td>\n",
" <td>Belgrade</td>\n",
" <td>...</td>\n",
" <td>45.7801</td>\n",
" <td>-111.1439</td>\n",
" <td>18182</td>\n",
" <td>Engineer, drilling</td>\n",
" <td>1975-06-29</td>\n",
" <td>ea272415875381d29cf29aee11550672</td>\n",
" <td>1356250457</td>\n",
" <td>46.228116</td>\n",
" <td>-111.718928</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2019-07-22 01:42:56</td>\n",
" <td>3560318482131952</td>\n",
" <td>fraud_Lehner, Mosciski and King</td>\n",
" <td>misc_net</td>\n",
" <td>1342.69</td>\n",
" <td>William</td>\n",
" <td>Skinner</td>\n",
" <td>M</td>\n",
" <td>524 Wu Spurs Suite 894</td>\n",
" <td>Mount Hope</td>\n",
" <td>...</td>\n",
" <td>34.4793</td>\n",
" <td>-87.4769</td>\n",
" <td>1312</td>\n",
" <td>Librarian, academic</td>\n",
" <td>1955-02-01</td>\n",
" <td>964730a13ec2011d75a3db37c55e526d</td>\n",
" <td>1342921376</td>\n",
" <td>34.898759</td>\n",
" <td>-88.125374</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2019-08-05 15:21:55</td>\n",
" <td>4477156602511939689</td>\n",
" <td>fraud_Medhurst, Cartwright and Ebert</td>\n",
" <td>personal_care</td>\n",
" <td>28.45</td>\n",
" <td>Angela</td>\n",
" <td>Ross</td>\n",
" <td>F</td>\n",
" <td>0107 Clements Point</td>\n",
" <td>American Fork</td>\n",
" <td>...</td>\n",
" <td>40.3928</td>\n",
" <td>-111.7941</td>\n",
" <td>42384</td>\n",
" <td>Futures trader</td>\n",
" <td>1992-12-29</td>\n",
" <td>518f49a0d3106c3f538c2f70d2f12e8a</td>\n",
" <td>1344180115</td>\n",
" <td>41.170642</td>\n",
" <td>-111.052342</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2020-01-27 20:49:30</td>\n",
" <td>30175986190993</td>\n",
" <td>fraud_Barton LLC</td>\n",
" <td>kids_pets</td>\n",
" <td>45.49</td>\n",
" <td>Rebecca</td>\n",
" <td>Butler</td>\n",
" <td>F</td>\n",
" <td>0665 Lisa Alley</td>\n",
" <td>Winger</td>\n",
" <td>...</td>\n",
" <td>47.5375</td>\n",
" <td>-95.9941</td>\n",
" <td>516</td>\n",
" <td>Applications developer</td>\n",
" <td>1966-06-07</td>\n",
" <td>8979eac3cefcacee095981dc43cd125d</td>\n",
" <td>1359319770</td>\n",
" <td>47.489127</td>\n",
" <td>-95.926267</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" trans_date_trans_time cc_num \\\n",
"0 2020-12-15 18:33:17 30551643947183 \n",
"1 2019-12-23 08:14:17 5359543825610251 \n",
"2 2019-07-22 01:42:56 3560318482131952 \n",
"3 2019-08-05 15:21:55 4477156602511939689 \n",
"4 2020-01-27 20:49:30 30175986190993 \n",
"\n",
" merchant category amt first \\\n",
"0 fraud_Dickinson Ltd personal_care 6.15 Morgan \n",
"1 fraud_Kilback LLC grocery_pos 67.35 Michael \n",
"2 fraud_Lehner, Mosciski and King misc_net 1342.69 William \n",
"3 fraud_Medhurst, Cartwright and Ebert personal_care 28.45 Angela \n",
"4 fraud_Barton LLC kids_pets 45.49 Rebecca \n",
"\n",
" last gender street city ... lat \\\n",
"0 Smith F 1441 Bradley Place Grover ... 35.1836 \n",
"1 Francis M 1833 Jeanette Stravenue Belgrade ... 45.7801 \n",
"2 Skinner M 524 Wu Spurs Suite 894 Mount Hope ... 34.4793 \n",
"3 Ross F 0107 Clements Point American Fork ... 40.3928 \n",
"4 Butler F 0665 Lisa Alley Winger ... 47.5375 \n",
"\n",
" long city_pop job dob \\\n",
"0 -81.4552 5621 Toxicologist 1973-11-14 \n",
"1 -111.1439 18182 Engineer, drilling 1975-06-29 \n",
"2 -87.4769 1312 Librarian, academic 1955-02-01 \n",
"3 -111.7941 42384 Futures trader 1992-12-29 \n",
"4 -95.9941 516 Applications developer 1966-06-07 \n",
"\n",
" trans_num unix_time merch_lat merch_long \\\n",
"0 7cab35c172dc6c78f551e981fe426fc6 1387132397 35.292860 -81.937193 \n",
"1 ea272415875381d29cf29aee11550672 1356250457 46.228116 -111.718928 \n",
"2 964730a13ec2011d75a3db37c55e526d 1342921376 34.898759 -88.125374 \n",
"3 518f49a0d3106c3f538c2f70d2f12e8a 1344180115 41.170642 -111.052342 \n",
"4 8979eac3cefcacee095981dc43cd125d 1359319770 47.489127 -95.926267 \n",
"\n",
" is_fraud \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"\n",
"# Load data\n",
"df = resampled_df\n",
"df.drop(columns=['Unnamed: 0'], inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e798580f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19302 entries, 0 to 19301\n",
"Data columns (total 22 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 trans_date_trans_time 19302 non-null object \n",
" 1 cc_num 19302 non-null int64 \n",
" 2 merchant 19302 non-null object \n",
" 3 category 19302 non-null object \n",
" 4 amt 19302 non-null float64\n",
" 5 first 19302 non-null object \n",
" 6 last 19302 non-null object \n",
" 7 gender 19302 non-null object \n",
" 8 street 19302 non-null object \n",
" 9 city 19302 non-null object \n",
" 10 state 19302 non-null object \n",
" 11 zip 19302 non-null int64 \n",
" 12 lat 19302 non-null float64\n",
" 13 long 19302 non-null float64\n",
" 14 city_pop 19302 non-null int64 \n",
" 15 job 19302 non-null object \n",
" 16 dob 19302 non-null object \n",
" 17 trans_num 19302 non-null object \n",
" 18 unix_time 19302 non-null int64 \n",
" 19 merch_lat 19302 non-null float64\n",
" 20 merch_long 19302 non-null float64\n",
" 21 is_fraud 19302 non-null int64 \n",
"dtypes: float64(5), int64(5), object(12)\n",
"memory usage: 3.2+ MB\n"
]
},
{
"data": {
"text/plain": [
"is_fraud\n",
"0 9651\n",
"1 9651\n",
"Name: count, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.info()\n",
"df.describe()\n",
"df['is_fraud'].value_counts()"
]
},
{
"cell_type": "markdown",
"id": "0ddb32f7",
"metadata": {},
"source": [
"## Feature Engineering"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "13345c23",
"metadata": {},
"outputs": [],
"source": [
"# Convert date fields\n",
"df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])\n",
"df['dob'] = pd.to_datetime(df['dob'])\n",
"\n",
"# Time-based features\n",
"df['hour'] = df['trans_date_trans_time'].dt.hour\n",
"df['day'] = df['trans_date_trans_time'].dt.day\n",
"df['month'] = df['trans_date_trans_time'].dt.month\n",
"df['weekday'] = df['trans_date_trans_time'].dt.weekday\n",
"\n",
"# Age\n",
"df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days // 365\n",
"\n",
"# Drop high-cardinality or redundant columns\n",
"df = df.drop(columns=['trans_date_trans_time', 'dob', 'trans_num', 'unix_time', 'cc_num', 'street'])\n",
"\n",
"# Encode categorical variables\n",
"cat_cols = ['gender', 'category', 'job', 'merchant', 'first', 'last', 'city', 'state']\n",
"for col in cat_cols:\n",
" df[col] = LabelEncoder().fit_transform(df[col])\n"
]
},
{
"cell_type": "markdown",
"id": "9ee0a1d6",
"metadata": {},
"source": [
"## Feature Selection"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8c12a37e",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABGMAAAOGCAYAAABIg127AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAAxgRJREFUeJzs3Qd4VGXWwPEzCSShhl6lSJHeBERAAQUpKivqIqgrRcQPFRURkbhKVSmCiyCKsAqoKLYVsQGCYkFWmohUAUFEaYJ0SEjmfs95d2d2BpKQQDLz3pn/73muZO7cufNOScycnOJxHMcRAAAAAAAAhERMaO4GAAAAAAAAimAMAAAAAABACBGMAQAAAAAACCGCMQAAAAAAACFEMAYAAAAAACCECMYAAAAAAACEEMEYAAAAAACAECIYAwAAAAAAEEIEYwAAAAAAAEKIYAwAAMi2mTNnisfjkR07duTYOfVcek49NwAAQCQjGAMACNsH+fS2IUOG5Mp9fvvttzJ8+HA5dOiQ2Grbtm3yf//3f1KlShVJSEiQwoULS8uWLeW5556TkydPSqR44403ZOLEiWKTXr16ScGCBTO8Xt+b/fv3z9U1vPDCCwSiAACIEnnCvQAAQPQaOXKkXHzxxUH76tatm2vBmBEjRpgP3UWKFBHbfPzxx9K1a1eJj4+XHj16mOchJSVFvvnmG3nkkUdk/fr1Mm3aNImUYMy6detkwIABQfsrVapkgk558+aVaKTBmBIlSpj3KAAAiGwEYwAAYdOpUydp0qSJuNnx48elQIECF3SO7du3S/fu3U0w4vPPP5eyZcv6r7vvvvtk69atJlhzoRzHkVOnTkm+fPnOuk73x8XFSUxM+JJmNftEM4IAAAAiHWVKAABrffrpp3LllVeaYEehQoXkuuuuMxkigdauXWsyCXylPWXKlJE777xTDhw44D9Gy5M0u0RpJo6vJEp7lGTWp0T3620Dz6P7NmzYILfddpsULVpUrrjiCv/1r7/+ujRu3NgEO4oVK2YCLL/++us5H+e4cePk2LFj8vLLLwcFYnyqVasmDz74oP9yamqqjBo1SqpWrWoyaSpXriyPPfaYJCcnB91O919//fWyYMECE/TSdb300kuyZMkS8zjmzJkjjz/+uJQvX17y588vR44cMbf77rvvpGPHjpKYmGj2t27dWpYuXXrOx/HBBx+Y16hcuXJmXbo+XWdaWpr/mDZt2pjA0i+//OJ/HXSdKqPXQgNUvveBZjXdcMMNsnHjxqBjfK+NBq582U+6/t69e8uJEyckN+jzPWzYMPP66OOtUKGCDB48+KzXYcaMGXL11VdLqVKlzHG1a9eWF198MegYfQ70vf3ll1/6nxd9rgLL+jRL6oEHHpCSJUuax6clbZo9paV3mk2l70fddA0aeAs0fvx4adGihRQvXty8D/R9+u6772ZYjjV79mypUaOG+Z7SY7/66qtceQ4BAIhWZMYAAMLm8OHD8scffwTt0zIN9dprr0nPnj2lQ4cOMnbsWPOBWj/AavDj+++/93+A/+yzz+Tnn382H7o1EOMr59F///3vf5sPlzfddJP89NNP8uabb8o//vEP/33oh9r9+/dne91aTlS9enV5+umn/R96n3rqKXniiSfklltukbvuusucd/LkydKqVSuz3sxKoz788EMTTNIPy1mh5581a5b89a9/lYcfftgET0aPHm0CFO+//37QsZs3b5Zbb73VfHDv27ev+YDto4ESzYYZNGiQCSDo1xr40Iwl/QCugQbNlPEFE77++mu57LLLMlyXBg2078rAgQPNv3quoUOHmiDPM888Y475+9//bl73Xbt2mddCZdarZdGiRWY9+vxowEXLmPR51V46q1ev9r8PfPT514CbPh96/T//+U8TBNH3UFac+X7MiNfrlb/85S8mQHL33XdLrVq15McffzSPSd9rc+fO9R+r79s6deqY4/PkyWNe73vvvdecQzOflPbQuf/++81zoc+RKl26dNB96vX6HtdyO31v6/tc31daglexYkXzfvzkk0/Mc61lbhqg8dG+Q3r/t99+uwngaCBO38cfffSRCaAF0oDQW2+9ZQI/GjzS8ikNzi1fvjzXyggBAIg6DgAAITZjxgyNYKS7qaNHjzpFihRx+vbtG3S7PXv2OImJiUH7T5w4cdb533zzTXOur776yr/vmWeeMfu2b98edKxe1v26pjPp/mHDhvkv69e679Zbbw06bseOHU5sbKzz1FNPBe3/8ccfnTx58py1P9Dhw4fNOW+44QYnK9asWWOOv+uuu4L2Dxo0yOz//PPP/fsqVapk9s2fPz/o2C+++MLsr1KlStDz5/V6nerVqzsdOnQwX/voMRdffLFzzTXXnPUaBj6f6b0W//d//+fkz5/fOXXqlH/fddddZ9Z2pvRei4YNGzqlSpVyDhw44N/3ww8/ODExMU6PHj3Oem3uvPPOoHPeeOONTvHixZ1z6dmzZ4bvSd923333+Y9/7bXXzBq+/vrroPNMnTrVHLt06dJMnxd9jvX5D1SnTh2ndevWZx3re67PfF2aN2/ueDwep1+/fv59qampzkUXXXTWec5cQ0pKilO3bl3n6quvDtrve6wrV6707/vll1+chIQE81wCAICcQZkSACBspkyZYjJbAjel/2rphWZ0aKaCb4uNjZVmzZrJF1984T9HYP8T7Xuix11++eXmsmZG5IZ+/foFXf7Xv/5lshw0KyNwvZrFoBk0ges9k680SMuwskIzH5RmnwTSDBl1Zm8ZzRLR7KL0aOZR4PO3Zs0a2bJliynB0jIv3+PQvjht27Y1pSr6ODMSeK6jR4+a22p5kWY1bdq0SbJr9+7dZk1adqRlXz7169eXa665xv9cZPba6P3rY/E9z5nRkpwz34+B78tA77zzjsmGqVmzZtBrrhlEKqP3qC8bTEu/NKNLL2dVnz59TKaXj34vaPxE9/vo94iWpOm5AwWu4c8//zT3q89Net8jzZs3N5lRPpp1o6VhWu4WWHIGAADOH2VKAICw0ZKX9Br4akBA+T7YnklHPvscPHjQlG1o2cW+ffuCjsvOB93sOHMClK5XPxRr4CU9mU0H8j0WDV5khfZa0dIh7VMSSAM/WrKi12e21nM9Dl+QJiP6nGpfkvRoaZj2oNHypDODH+fzWvgeS2BplY8GQjQ4cGYDZQ0cBPKtVQMQge+b9Gggo127dllamz5XWhampW7pCXwvar8dLflatmzZWf1r9HnR3jZZceZj891Oe9WcuV8fbyAtR3ryySdNcCuwp01gcMcnvffxJZdcYtau5Xf6XgMAABeGYAwAwDq+7AvtG5PeBz/tu+Gj2SjaM0Mb9DZs2ND03NDba4+LzLI4MvswqjLLADhzGpHej55HGw7rB/ozZdYTRQME2vBWRz1nR0brPtdaM7vO93xpzxF9LtOT0WPRTCbN9tDHoyPLtXmvZppo5sWjjz6apdciJ6T3/KszG9peKH089erVk2effTbd630Bkm3btpmsIs2g0WN1v/bm0awe7S+Tneclo8eW3v7Ax6u9frRfjPYv0v4v2iRaA4TaC0jHjAMAgNAjGAMAsI5+kFfaeDWzTAX96//ixYtNZow2ij0zwyMrwQtf5oQGEwKdmWFyrvXqh1/NNNEMguzSiUfajFUzJ7REJDM6/lo/wOtj1OwQn71795rHoNdf6POuAZWsZoj46IQmLQfSki390B84tvt8A0m+x6JNiM+kZU/aiPlCx4pfyHP1ww8/mEBLZo9Hm/VqJsq8efOCMlvSK13L6vOSXe+9954JjGkmkTbk9dFgTHrS+/7RpsQ6WSujTCAAAJA99IwBAFhHe5xoQECnw5w+ffqs630TkHwZAWdmPehkmjP5PrSfGXTR+9EP9WeO7tUMgqzSaU26Fg0KnbkWvRw4Zjs9OopY16dTkjSocibNrtBpOOraa69N9zH6MjTOnIyTHdonRIMMOgZZR22fKbPJU+m9Fjq1J73nUR9rVsqWNINDM3R0clTg66ZZRAsXLvQ/F+GgGVm//fabTJ8+/azrdOKTlk9l9LzoY08vEKLPy5nvz5y
"text/plain": [
"<Figure size 1400x1000 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(14, 10))\n",
"sns.heatmap(df.corr(), cmap='coolwarm', annot=False)\n",
"plt.title(\"Feature Correlation Heatmap\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "91ac8fae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Features remaining after variance thresholding: 20\n"
]
}
],
"source": [
"X = df.drop('is_fraud', axis=1)\n",
"y = df['is_fraud']\n",
"\n",
"selector = VarianceThreshold(threshold=0.01)\n",
"X_var = selector.fit_transform(X)\n",
"print(f\"Features remaining after variance thresholding: {X_var.shape[1]}\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8f9f120e",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA3UAAAKqCAYAAACdEXduAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAAaHVJREFUeJzt3Qd8VFX6//EnISQQIKFXA0iR3otLrxIRUIqA4NLFRQWMCEgsQEBNRLCAqyKrBBBFlKWoQCiKIqJ0JHSQmIg0KQlFA4T5v57z25n/TEhCEhImN/N5v153ydy55czMjTvfPOec62Wz2WwCAAAAALAkb3c3AAAAAACQeYQ6AAAAALAwQh0AAAAAWBihDgAAAAAsjFAHAAAAABZGqAMAAAAACyPUAQAAAICFEeoAAAAAwMIIdQAAAABgYYQ6AMilBg8eLBUrVnR3MwAAQDYj1AHAbYqMjBQvLy/H4uPjI+XKlTOh6vjx4+5uXo59n5yXCRMmSE706quvyrJly9K1bUxMTKqv7x//+Ee2tO+PP/6QyZMny65duySnsb8f06dPF6tauXKleX8BIKfzcXcDACC3mDJlitx9993y999/y08//WRCzA8//CDR0dGSL18+dzcvx71PzmrXri05NdQ9/PDD0r1793Tv069fP3nggQdc1pUoUSLbQl1YWJipyNavXz9bzuHJNNT9+9//JtgByPEIdQCQRTp37iyNGzc2Pz/22GNSvHhxee2112TFihXSp08fdzcvR75PWeny5ctSoEABcbeGDRvKP//5T7Ey/cOEr6+veHt7ZoeenHItAUB6eeZ/rQHgDmjVqpX59+jRo451V69elYkTJ0qjRo0kMDDQfHHU7b799ttUu6598MEHUrlyZfHz85MmTZrI1q1bbzqXdhHUapdWBPXfpUuXpvpl9dlnn5WgoCBzvGrVqplz2Gw2l+303CNHjpTPP/9catasKfnz55dmzZrJnj17zPOzZ8+WKlWqmPO1bdvWtDerfPPNN+Y90femcOHC8tBDD8n+/ftdttHKibZx37590r9/fylSpIi0bNnS8fzHH39s3mNtd9GiReWRRx6RuLg4l2McPnxYevXqJaVLlzav46677jLbxcfHO94Dfb/mzZvn6EapXWpv14EDB0z1T9ul59WAq8Hf2blz52Ts2LFSp04dKViwoAQEBJgwvHv3bsc2GzZsMNeDGjJkiKONWiFWWr1Lqb36eenifBzdb9GiRfLiiy+arsP+/v6SkJBgnv/555/l/vvvN9errm/Tpo1s2rTptrrgagV79OjRpoKpn/G//vUv87tx4cIFGThwoPk8dRk/frzLten8e/Hmm29KhQoVzGesbdKKeFZeS/reaZVOOXeltdM2NG/eXIoVK2baoNfbF198cVMb7L9L9t9R/b2rVauWrF69+qZttbv2sGHDpGzZsmY7rWg/8cQT5r2x0/coJCTE8Tusv4f6x6MbN264HEs/T21ToUKFzPWj19Lbb7+d4c8MgDVQqQOAbGIPOvol0U6/KP/nP/8xXfSGDx8uFy9elA8//FCCg4Nly5YtN3Wh++STT8w2+qVXvxxOmzZNevbsKb/++qvkzZvXbLNmzRoTTjR8hYeHy9mzZ82XfA0pzvTL8YMPPmgCpH5x1HNFRUXJuHHjzJdJ/ZLsbOPGjSZsPPXUU+axHrtr167mi/a7774rTz75pJw/f960aejQoeYLdHpoaPrzzz9d1mlVU61bt86El0qVKpkv23/99ZfMmjVLWrRoITt27Lhp4pfevXtL1apVTTdJ+5f/V155RV566SVTHdWK6ZkzZ8wxWrduLTt37jRf7vVLsr7niYmJMmrUKBPs9D346quvzJdmDTALFiww+zdt2lQef/xxc2wN17dy5cqVm16fHk8/r71795rXosFJxxFq2Fi8eLHp3rlkyRLp0aOH2V4/Xw0B+vr0i/2pU6dMkNbwouFDv/TXqFHDdGXVPxJo++x/RNCgkRlTp0411TkNk/q+6M/6mernoeFg0qRJpnI3d+5cad++vbk+9L3JDPt7rl1Htauy/uFCP5cff/xRypcvbz5P7fr4+uuvmyCkQc/Z/Pnzze+FXptaVdSwom3SPzqUKlUqS66lBg0amO6ta9euNddCcnpO/X169NFHzfWkIUqPoddQly5dXLbVEPvf//7X/M5oyJo5c6b5nY2NjTWhUOm59P3U608/z+rVq5trUoOiXlP6eei/eg3oev1vgr5X+p6FhobKiRMn5K233jLH0jbrf2M6dOhgAp/SMKth/Omnn87UZwYgh7MBAG7L3LlzNU3Y1q1bZztz5owtLi7O9sUXX9hKlChh8/PzM4/trl+/bktMTHTZ//z587ZSpUrZhg4d6lh37Ngxc8xixYrZzp0751i/fPlys/7LL790rKtfv76tTJkytgsXLjjWrVmzxmxXoUIFx7ply5aZdS+//LLL+R9++GGbl5eX7ciRI451up22XdthN3v2bLO+dOnStoSEBMf60NBQs95527Tep5QW59dSsmRJ29mzZx3rdu/ebfP29rYNHDjQsW7SpElmv379+rmcIyYmxpYnTx7bK6+84rJ+z549Nh8fH8f6nTt3mv0///zzNNtcoEAB26BBg2zpYf/MUlq+/fZbs02HDh1sderUsf3999+O/W7cuGFr3ry5rWrVqo51+nxSUtJNx9fPZMqUKY51W7duNcfX9zY5/exTanubNm3MYqdt02NUqlTJduXKFZd2aZuCg4PNz3a6zd13322777770vV+vP766zddA8mP2axZM3MNjhgxwuV35a677nJpq/2Y+fPnt/3++++O9T///LNZ/8wzz2TZtaSeeuopl+vTmfN7pa5evWqrXbu2rX379i7rdX9fX1+X3y9th66fNWuWY522Sdumn2ly9vdq6tSp5po8dOiQy/MTJkww131sbKx5/PTTT9sCAgLMewjAM9D9EgCySMeOHU13Mu0Wpd3rtAqjlS7nilmePHnMX9yVdpfSbnbXr183XfC0epBc3759XSp99mqMVnKU/nVeZz4cNGiQqQbZ3XfffaZy50wrH3p+7fbmTLtj6nfPVatWuazXv/I7VzPuvfde869WGLTakHy9vU23ol3atJLgvDi/Fu32pl0T7erWrWtej7Y/uREjRrg81mqIvq9apdNqmX3RqpBWYezdXO3vlVYqtfqRlbTKkvz11atXz3zWWvnStmmVyd42raxq1VC7g9pnS9VudfbxbElJSWYb7Yap3WVTuk6ygl5D2o3QTj8LbZN2SdTz29urXVL12vj+++9v6vKXXlopdu7KqNeQXoO63k6vVf29SOm60sqmVjvttMKlx7BfI1lxLd2K83ulFWutQOvvZ0qfj/63wbnKq+3QLpH216bvo1Zmu3XrluJ4U/t7pd2h9Rz63wTn61uPr9eJfiZKq576Odl/twDkfnS/BIAsomHlnnvuMV/uPvroI/MFS7+cJ6djtGbMmGHGVl27ds2xPvmMkEq7VzmzBzz9Eql+++03868GluSSBwDdVrvtOQcypd34nI+V2rntQUhDa0rr7W26Ff0CntIXV/v5td3JaRs1gCWfwCL5e6YhRMNBSu+HsndZ1f3GjBkjb7zxhixcuNB8UdaudDrBiXM4zgw9t37JTk6712rbtGuoLik5ffq0CSv6JV+792k312PHjpkv7Hb27npZLaX30h72UqPXuvMfHdIrI9dWStdVSp+v/u5pV9asupZuRbtZvvzyyyY8andVO+ewmtrrVfq+2V+bdhHWrtm3mgVWP5Nffvkl1dlU9fpR2s1T3wvtfqrXU6dOncwfE3RsJIDciVAHAFnEOaxoJUEnW9Aqx8GDB02VxT6Bh1YP9Hkdy1ayZElTkdDxas4TqtjpcylJPrFJdkjt3O5sU1rVEqVhSL9Ua9UxpXbaPwelwVo/i+XLl5txiVrB1M9Bx3glH4+YFexVLR2zppW5lOikF0rHdWnw07GKOtZNq01audMJMtJbHUspXCgNiCm9Nym9l0rHtaV2uwTn9zO7rq07dV0lf/1p0fGE+kcAHaepwbtMmTLmDwY63lDHwWb
"text/plain": [
"<Figure size 1000x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"forest = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"forest.fit(X, y)\n",
"\n",
"importances = pd.Series(forest.feature_importances_, index=X.columns)\n",
"importances.sort_values().plot(kind='barh', figsize=(10, 8))\n",
"plt.title(\"Random Forest Feature Importances\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "9121633c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Top features selected by RFE:\n",
"['merchant', 'category', 'amt', 'last', 'city', 'city_pop', 'merch_lat', 'merch_long', 'hour', 'age']\n"
]
}
],
"source": [
"rfe = RFE(estimator=RandomForestClassifier(n_estimators=50, random_state=42), n_features_to_select=10)\n",
"rfe.fit(X, y)\n",
"\n",
"selected_features = X.columns[rfe.support_]\n",
"print(\"Top features selected by RFE:\")\n",
"print(selected_features.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "784212b7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>merchant</th>\n",
" <th>category</th>\n",
" <th>amt</th>\n",
" <th>last</th>\n",
" <th>city</th>\n",
" <th>city_pop</th>\n",
" <th>merch_lat</th>\n",
" <th>merch_long</th>\n",
" <th>hour</th>\n",
" <th>age</th>\n",
" <th>is_fraud</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>136</td>\n",
" <td>10</td>\n",
" <td>6.15</td>\n",
" <td>409</td>\n",
" <td>317</td>\n",
" <td>5621</td>\n",
" <td>35.292860</td>\n",
" <td>-81.937193</td>\n",
" <td>18</td>\n",
" <td>47</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>316</td>\n",
" <td>4</td>\n",
" <td>67.35</td>\n",
" <td>144</td>\n",
" <td>62</td>\n",
" <td>18182</td>\n",
" <td>46.228116</td>\n",
" <td>-111.718928</td>\n",
" <td>8</td>\n",
" <td>44</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>383</td>\n",
" <td>8</td>\n",
" <td>1342.69</td>\n",
" <td>407</td>\n",
" <td>546</td>\n",
" <td>1312</td>\n",
" <td>34.898759</td>\n",
" <td>-88.125374</td>\n",
" <td>1</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>422</td>\n",
" <td>10</td>\n",
" <td>28.45</td>\n",
" <td>380</td>\n",
" <td>19</td>\n",
" <td>42384</td>\n",
" <td>41.170642</td>\n",
" <td>-111.052342</td>\n",
" <td>15</td>\n",
" <td>26</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>26</td>\n",
" <td>7</td>\n",
" <td>45.49</td>\n",
" <td>54</td>\n",
" <td>895</td>\n",
" <td>516</td>\n",
" <td>47.489127</td>\n",
" <td>-95.926267</td>\n",
" <td>20</td>\n",
" <td>53</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" merchant category amt last city city_pop merch_lat merch_long \\\n",
"0 136 10 6.15 409 317 5621 35.292860 -81.937193 \n",
"1 316 4 67.35 144 62 18182 46.228116 -111.718928 \n",
"2 383 8 1342.69 407 546 1312 34.898759 -88.125374 \n",
"3 422 10 28.45 380 19 42384 41.170642 -111.052342 \n",
"4 26 7 45.49 54 895 516 47.489127 -95.926267 \n",
"\n",
" hour age is_fraud \n",
"0 18 47 0 \n",
"1 8 44 0 \n",
"2 1 64 0 \n",
"3 15 26 0 \n",
"4 20 53 0 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_selected = df[selected_features.tolist() + ['is_fraud']]\n",
"df_selected.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "a91c3352",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"is_fraud\n",
"0 9651\n",
"1 9651\n",
"Name: count, dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['is_fraud'].value_counts()\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "959821de",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"../data/processed/processed_data.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8269571",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}