{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7f073598",
   "metadata": {},
   "source": [
    "# Feature Engineering and Feature Selection for Fraud Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1816729a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "from sklearn.feature_selection import VarianceThreshold, RFE\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from imblearn.under_sampling import RandomUnderSampler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5309a15c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the two raw CSV files and merge them into a single frame.\n",
    "df_train = pd.read_csv('../data/raw/fraudTrain.csv')\n",
    "df_test = pd.read_csv('../data/raw/fraudTest.csv')\n",
    "\n",
    "# ignore_index=True: each CSV is read with its own 0..n-1 index, so a plain\n",
    "# concat would leave duplicate row labels in the combined frame.\n",
    "df = pd.concat([df_train, df_test], ignore_index=True)\n",
    "\n",
    "# Separate the feature columns from the `is_fraud` label column.\n",
    "X = df.drop(['is_fraud'], axis=1)\n",
    "y = df['is_fraud']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd2b4e07",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed (arbitrary) seed so the randomly retained rows, and therefore the\n",
    "# saved CSV, are identical on every Restart & Run All.\n",
    "RANDOM_STATE = 42\n",
    "\n",
    "# sampling_strategy=1 asks imblearn for a 1:1 minority/majority ratio after resampling.\n",
    "rus = RandomUnderSampler(sampling_strategy=1, random_state=RANDOM_STATE)\n",
    "X_res, y_res = rus.fit_resample(X, y)\n",
    "\n",
    "# Combine X_res and y_res into a single DataFrame\n",
    "resampled_df = pd.concat([X_res, y_res], axis=1)\n",
    "\n",
    "# Save the combined DataFrame to a new file\n",
    "resampled_df.to_csv('../data/processed/resampled_data.csv', index=False)\n",
    "\n",
    "# Pie chart of the class counts after resampling, labelled with percentages.\n",
    "ax = y_res.value_counts().plot.pie(autopct='%.2f')\n",
    "_ = ax.set_title(\"Under-sampling\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "890be8a6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | trans_date_trans_time | \n", "cc_num | \n", "merchant | \n", "category | \n", "amt | \n", "first | \n", "last | \n", "gender | \n", "street | \n", "city | \n", "... | \n", "lat | \n", "long | \n", "city_pop | \n", "job | \n", "dob | \n", "trans_num | \n", "unix_time | \n", "merch_lat | \n", "merch_long | \n", "is_fraud | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "2020-12-15 18:33:17 | \n", "30551643947183 | \n", "fraud_Dickinson Ltd | \n", "personal_care | \n", "6.15 | \n", "Morgan | \n", "Smith | \n", "F | \n", "1441 Bradley Place | \n", "Grover | \n", "... | \n", "35.1836 | \n", "-81.4552 | \n", "5621 | \n", "Toxicologist | \n", "1973-11-14 | \n", "7cab35c172dc6c78f551e981fe426fc6 | \n", "1387132397 | \n", "35.292860 | \n", "-81.937193 | \n", "0 | \n", "
| 1 | \n", "2019-12-23 08:14:17 | \n", "5359543825610251 | \n", "fraud_Kilback LLC | \n", "grocery_pos | \n", "67.35 | \n", "Michael | \n", "Francis | \n", "M | \n", "1833 Jeanette Stravenue | \n", "Belgrade | \n", "... | \n", "45.7801 | \n", "-111.1439 | \n", "18182 | \n", "Engineer, drilling | \n", "1975-06-29 | \n", "ea272415875381d29cf29aee11550672 | \n", "1356250457 | \n", "46.228116 | \n", "-111.718928 | \n", "0 | \n", "
| 2 | \n", "2019-07-22 01:42:56 | \n", "3560318482131952 | \n", "fraud_Lehner, Mosciski and King | \n", "misc_net | \n", "1342.69 | \n", "William | \n", "Skinner | \n", "M | \n", "524 Wu Spurs Suite 894 | \n", "Mount Hope | \n", "... | \n", "34.4793 | \n", "-87.4769 | \n", "1312 | \n", "Librarian, academic | \n", "1955-02-01 | \n", "964730a13ec2011d75a3db37c55e526d | \n", "1342921376 | \n", "34.898759 | \n", "-88.125374 | \n", "0 | \n", "
| 3 | \n", "2019-08-05 15:21:55 | \n", "4477156602511939689 | \n", "fraud_Medhurst, Cartwright and Ebert | \n", "personal_care | \n", "28.45 | \n", "Angela | \n", "Ross | \n", "F | \n", "0107 Clements Point | \n", "American Fork | \n", "... | \n", "40.3928 | \n", "-111.7941 | \n", "42384 | \n", "Futures trader | \n", "1992-12-29 | \n", "518f49a0d3106c3f538c2f70d2f12e8a | \n", "1344180115 | \n", "41.170642 | \n", "-111.052342 | \n", "0 | \n", "
| 4 | \n", "2020-01-27 20:49:30 | \n", "30175986190993 | \n", "fraud_Barton LLC | \n", "kids_pets | \n", "45.49 | \n", "Rebecca | \n", "Butler | \n", "F | \n", "0665 Lisa Alley | \n", "Winger | \n", "... | \n", "47.5375 | \n", "-95.9941 | \n", "516 | \n", "Applications developer | \n", "1966-06-07 | \n", "8979eac3cefcacee095981dc43cd125d | \n", "1359319770 | \n", "47.489127 | \n", "-95.926267 | \n", "0 | \n", "
5 rows × 22 columns
\n", "| \n", " | merchant | \n", "category | \n", "amt | \n", "last | \n", "city | \n", "city_pop | \n", "merch_lat | \n", "merch_long | \n", "hour | \n", "age | \n", "is_fraud | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "136 | \n", "10 | \n", "6.15 | \n", "409 | \n", "317 | \n", "5621 | \n", "35.292860 | \n", "-81.937193 | \n", "18 | \n", "47 | \n", "0 | \n", "
| 1 | \n", "316 | \n", "4 | \n", "67.35 | \n", "144 | \n", "62 | \n", "18182 | \n", "46.228116 | \n", "-111.718928 | \n", "8 | \n", "44 | \n", "0 | \n", "
| 2 | \n", "383 | \n", "8 | \n", "1342.69 | \n", "407 | \n", "546 | \n", "1312 | \n", "34.898759 | \n", "-88.125374 | \n", "1 | \n", "64 | \n", "0 | \n", "
| 3 | \n", "422 | \n", "10 | \n", "28.45 | \n", "380 | \n", "19 | \n", "42384 | \n", "41.170642 | \n", "-111.052342 | \n", "15 | \n", "26 | \n", "0 | \n", "
| 4 | \n", "26 | \n", "7 | \n", "45.49 | \n", "54 | \n", "895 | \n", "516 | \n", "47.489127 | \n", "-95.926267 | \n", "20 | \n", "53 | \n", "0 | \n", "