569 lines
18 KiB
Plaintext
569 lines
18 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Feature Engineering for Fraud Detection"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This notebook focuses on transforming the raw transaction data into meaningful features for fraud detection. We'll create new features, handle categorical variables, and prepare the data for model training."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Import necessary libraries\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"from datetime import datetime\n",
|
|
"from geopy.distance import geodesic\n",
|
|
"import os\n",
|
|
"import sys\n",
|
|
"\n",
|
|
"# Set plot style\n",
|
|
"plt.style.use('seaborn-v0_8-whitegrid')\n",
|
|
"sns.set(font_scale=1.2)\n",
|
|
"\n",
|
|
"# Configure plot size\n",
|
|
"plt.rcParams['figure.figsize'] = (12, 8)\n",
|
|
"\n",
|
|
"# Display all columns\n",
|
|
"pd.set_option('display.max_columns', None)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Add the project root to the path so we can import from src\n",
|
|
"sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))\n",
|
|
"from src import config"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Load the Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load training data\n",
|
|
"train_data = pd.read_csv(config.TRAIN_DATA_PATH)\n",
|
|
"\n",
|
|
"# Load test data\n",
|
|
"test_data = pd.read_csv(config.TEST_DATA_PATH)\n",
|
|
"\n",
|
|
"print(f'Training data shape: {train_data.shape}')\n",
|
|
"print(f'Test data shape: {test_data.shape}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Display the first few rows of the training data\n",
|
|
"train_data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Time-Based Features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Let's extract time-based features from the transaction timestamp. These features can help identify patterns in fraudulent transactions based on when they occur."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Convert transaction time to datetime\n",
|
|
"train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'])\n",
|
|
"\n",
|
|
"# Extract time-based features\n",
|
|
"train_data['hour'] = train_data['trans_date_trans_time'].dt.hour\n",
|
|
"train_data['day'] = train_data['trans_date_trans_time'].dt.day\n",
|
|
"train_data['weekday'] = train_data['trans_date_trans_time'].dt.dayofweek\n",
|
|
"train_data['month'] = train_data['trans_date_trans_time'].dt.month\n",
|
|
"train_data['year'] = train_data['trans_date_trans_time'].dt.year\n",
|
|
"\n",
|
|
"# Create is_weekend feature\n",
|
|
"train_data['is_weekend'] = train_data['weekday'].apply(lambda x: 1 if x >= 5 else 0)\n",
|
|
"\n",
|
|
"# Create time of day categories\n",
|
|
"train_data['time_of_day'] = train_data['hour'].apply(lambda x: \n",
|
|
" 'night' if 0 <= x < 6 else\n",
|
|
" 'morning' if 6 <= x < 12 else\n",
|
|
" 'afternoon' if 12 <= x < 18 else\n",
|
|
" 'evening')\n",
|
|
"\n",
|
|
"# Display the new features\n",
|
|
"train_data[['trans_date_trans_time', 'hour', 'day', 'weekday', 'month', 'year', 'is_weekend', 'time_of_day']].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Let's analyze the relationship between these time-based features and fraud."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Analyze fraud by hour of day\n",
|
|
"hour_fraud = train_data.groupby('hour')['is_fraud'].mean().reset_index()\n",
|
|
"hour_fraud.columns = ['Hour', 'Fraud Rate']\n",
|
|
"\n",
|
|
"plt.figure(figsize=(12, 6))\n",
|
|
"sns.lineplot(x='Hour', y='Fraud Rate', data=hour_fraud, marker='o')\n",
|
|
"plt.title('Fraud Rate by Hour of Day')\n",
|
|
"plt.xlabel('Hour of Day')\n",
|
|
"plt.ylabel('Fraud Rate')\n",
|
|
"plt.grid(True)\n",
|
|
"\n",
|
|
"# Add percentage labels\n",
|
|
"for i, rate in enumerate(hour_fraud['Fraud Rate']):\n",
|
|
" plt.text(i, rate + 0.0005, f'{rate:.2%}', ha='center', fontsize=9)\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Analyze fraud by day of week\n",
|
|
"train_data['day_name'] = train_data['trans_date_trans_time'].dt.day_name()\n",
|
|
"\n",
|
|
"day_fraud = train_data.groupby(['weekday', 'day_name'])['is_fraud'].mean().reset_index()\n",
|
|
"day_fraud.columns = ['Weekday', 'Day Name', 'Fraud Rate']\n",
|
|
"day_fraud = day_fraud.sort_values('Weekday') # Sort by weekday number\n",
|
|
"\n",
|
|
"plt.figure(figsize=(12, 6))\n",
|
|
"sns.barplot(x='Day Name', y='Fraud Rate', data=day_fraud)\n",
|
|
"plt.title('Fraud Rate by Day of Week')\n",
|
|
"plt.xlabel('Day of Week')\n",
|
|
"plt.ylabel('Fraud Rate')\n",
|
|
"\n",
|
|
"# Add percentage labels\n",
|
|
"for i, rate in enumerate(day_fraud['Fraud Rate']):\n",
|
|
" plt.text(i, rate + 0.0005, f'{rate:.2%}', ha='center', fontsize=10)\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Distance Calculation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"The distance between the cardholder and the merchant can be a strong indicator of fraud. Let's calculate this distance using the latitude and longitude coordinates."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def calculate_distance(row):\n",
|
|
" \"\"\"\n",
|
|
" Calculate the distance between the cardholder and merchant in kilometers\n",
|
|
" \"\"\"\n",
|
|
" try:\n",
|
|
" cardholder_coords = (row['lat'], row['long'])\n",
|
|
" merchant_coords = (row['merch_lat'], row['merch_long'])\n",
|
|
" return geodesic(cardholder_coords, merchant_coords).kilometers\n",
|
|
" except:\n",
|
|
" return np.nan\n",
|
|
"\n",
|
|
"# Calculate distance for a sample of the data (for performance)\n",
|
|
"sample_data = train_data.sample(n=10000, random_state=42)\n",
|
|
"sample_data['distance_km'] = sample_data.apply(calculate_distance, axis=1)\n",
|
|
"\n",
|
|
"# Display the distance feature\n",
|
|
"sample_data[['lat', 'long', 'merch_lat', 'merch_long', 'distance_km']].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Analyze distance vs. fraud\n",
|
|
"plt.figure(figsize=(12, 6))\n",
|
|
"sns.boxplot(x='is_fraud', y='distance_km', data=sample_data)\n",
|
|
"plt.title('Distance Between Cardholder and Merchant by Fraud Status')\n",
|
|
"plt.xlabel('Is Fraud (1 = Yes, 0 = No)')\n",
|
|
"plt.ylabel('Distance (km)')\n",
|
|
"plt.ylim(0, 5000) # Limit y-axis for better visualization\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Age Calculation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Let's calculate the age of the cardholder at the time of the transaction."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Convert DOB to datetime\n",
|
|
"train_data['dob'] = pd.to_datetime(train_data['dob'])\n",
|
|
"\n",
|
|
"# Calculate age at the time of transaction\n",
|
|
"train_data['age'] = train_data.apply(lambda row: (row['trans_date_trans_time'].year - row['dob'].year) - \n",
|
|
" ((row['trans_date_trans_time'].month, row['trans_date_trans_time'].day) < \n",
|
|
" (row['dob'].month, row['dob'].day)), axis=1)\n",
|
|
"\n",
|
|
"# Display the age feature\n",
|
|
"train_data[['dob', 'trans_date_trans_time', 'age']].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create age groups\n",
|
|
"bins = [0, 18, 25, 35, 45, 55, 65, 100]\n",
|
|
"labels = ['<18', '18-25', '26-35', '36-45', '46-55', '56-65', '65+']\n",
|
|
"train_data['age_group'] = pd.cut(train_data['age'], bins=bins, labels=labels)\n",
|
|
"\n",
|
|
"# Analyze fraud by age group\n",
|
|
"age_fraud = train_data.groupby('age_group')['is_fraud'].mean().reset_index()\n",
|
|
"age_fraud.columns = ['Age Group', 'Fraud Rate']\n",
|
|
"\n",
|
|
"plt.figure(figsize=(12, 6))\n",
|
|
"sns.barplot(x='Age Group', y='Fraud Rate', data=age_fraud)\n",
|
|
"plt.title('Fraud Rate by Age Group')\n",
|
|
"plt.xlabel('Age Group')\n",
|
|
"plt.ylabel('Fraud Rate')\n",
|
|
"\n",
|
|
"# Add percentage labels\n",
|
|
"for i, rate in enumerate(age_fraud['Fraud Rate']):\n",
|
|
" plt.text(i, rate + 0.0005, f'{rate:.2%}', ha='center', fontsize=10)\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Transaction Amount Features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Let's create features related to transaction amounts, such as the transaction amount relative to the average for that category."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Calculate average transaction amount by category\n",
|
|
"category_avg = train_data.groupby('category')['amt'].mean().to_dict()\n",
|
|
"\n",
|
|
"# Create feature for transaction amount relative to category average\n",
|
|
"train_data['amt_to_category_avg'] = train_data.apply(\n",
|
|
" lambda row: row['amt'] / category_avg.get(row['category'], 1), axis=1)\n",
|
|
"\n",
|
|
"# Display the new feature\n",
|
|
"train_data[['category', 'amt', 'amt_to_category_avg']].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Analyze the relationship between amt_to_category_avg and fraud\n",
|
|
"plt.figure(figsize=(12, 6))\n",
|
|
"sns.boxplot(x='is_fraud', y='amt_to_category_avg', data=train_data)\n",
|
|
"plt.title('Transaction Amount Relative to Category Average by Fraud Status')\n",
|
|
"plt.xlabel('Is Fraud (1 = Yes, 0 = No)')\n",
|
|
"plt.ylabel('Amount / Category Average')\n",
|
|
"plt.ylim(0, 10) # Limit y-axis for better visualization\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5. Handling Categorical Features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Let's identify and prepare categorical features for model training."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Identify categorical columns\n",
|
|
"categorical_cols = train_data.select_dtypes(include=['object', 'category']).columns.tolist()\n",
|
|
"print(f'Categorical columns: {categorical_cols}')\n",
|
|
"\n",
|
|
"# For demonstration, let's look at the category feature\n",
|
|
"category_counts = train_data['category'].value_counts()\n",
|
|
"print(f'\nNumber of unique categories: {len(category_counts)}')\n",
|
|
"print('\nTop 10 categories:')\n",
|
|
"print(category_counts.head(10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Analyze fraud rate by category\n",
|
|
"category_fraud = train_data.groupby('category')['is_fraud'].mean().sort_values(ascending=False).reset_index()\n",
|
|
"category_fraud.columns = ['Category', 'Fraud Rate']\n",
|
|
"\n",
|
|
"plt.figure(figsize=(12, 8))\n",
|
|
"sns.barplot(x='Fraud Rate', y='Category', data=category_fraud)\n",
|
|
"plt.title('Fraud Rate by Transaction Category')\n",
|
|
"plt.xlabel('Fraud Rate')\n",
|
|
"plt.ylabel('Category')\n",
|
|
"\n",
|
|
"# Add percentage labels\n",
|
|
"for i, rate in enumerate(category_fraud['Fraud Rate']):\n",
|
|
" plt.text(rate + 0.001, i, f'{rate:.2%}', va='center', fontsize=10)\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6. Feature Selection"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Let's select the most relevant features for our fraud detection model."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Select features for model\n",
|
|
"feature_cols = [\n",
|
|
" 'amt', 'distance_km', 'age', 'hour', 'day', 'weekday', 'month',\n",
|
|
" 'is_weekend', 'amt_to_category_avg', 'city_pop', 'category', 'time_of_day'\n",
|
|
"]\n",
|
|
"\n",
|
|
"# Create the final dataset for model training\n",
|
|
"final_data = train_data[feature_cols + ['is_fraud']]\n",
|
|
"\n",
|
|
"# Display the final dataset\n",
|
|
"final_data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 7. Save Processed Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Save the processed training data\n",
|
|
"final_data.to_csv(config.PROCESSED_TRAIN_DATA_PATH, index=False)\n",
|
|
"print(f'Processed training data saved to {config.PROCESSED_TRAIN_DATA_PATH}')\n",
|
|
"\n",
|
|
"# Save the category averages for use during prediction\n",
|
|
"category_avg_df = pd.DataFrame(list(category_avg.items()), columns=['category', 'amt'])\n",
|
|
"category_avg_df.to_csv(config.PROCESSED_DATA_DIR / 'category_avg.csv', index=False)\n",
|
|
"print(f'Category averages saved to {os.path.join(config.PROCESSED_DATA_DIR, 'category_avg.csv')}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 8. Process Test Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Apply the same preprocessing steps to the test data\n",
|
|
"# Convert transaction time to datetime\n",
|
|
"test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])\n",
|
|
"\n",
|
|
"# Extract time-based features\n",
|
|
"test_data['hour'] = test_data['trans_date_trans_time'].dt.hour\n",
|
|
"test_data['day'] = test_data['trans_date_trans_time'].dt.day\n",
|
|
"test_data['weekday'] = test_data['trans_date_trans_time'].dt.dayofweek\n",
|
|
"test_data['month'] = test_data['trans_date_trans_time'].dt.month\n",
|
|
"test_data['year'] = test_data['trans_date_trans_time'].dt.year\n",
|
|
"test_data['is_weekend'] = test_data['weekday'].apply(lambda x: 1 if x >= 5 else 0)\n",
|
|
"test_data['time_of_day'] = test_data['hour'].apply(lambda x: \n",
|
|
" 'night' if 0 <= x < 6 else\n",
|
|
" 'morning' if 6 <= x < 12 else\n",
|
|
" 'afternoon' if 12 <= x < 18 else\n",
|
|
" 'evening')\n",
|
|
"\n",
|
|
"# Calculate distance\n",
|
|
"test_data['distance_km'] = test_data.apply(calculate_distance, axis=1)\n",
|
|
"\n",
|
|
"# Calculate age\n",
|
|
"test_data['dob'] = pd.to_datetime(test_data['dob'])\n",
|
|
"test_data['age'] = test_data.apply(lambda row: (row['trans_date_trans_time'].year - row['dob'].year) - \n",
|
|
" ((row['trans_date_trans_time'].month, row['trans_date_trans_time'].day) < \n",
|
|
" (row['dob'].month, row['dob'].day)), axis=1)\n",
|
|
"\n",
|
|
"# Create feature for transaction amount relative to category average\n",
|
|
"test_data['amt_to_category_avg'] = test_data.apply(\n",
|
|
" lambda row: row['amt'] / category_avg.get(row['category'], 1), axis=1)\n",
|
|
"\n",
|
|
"# Select the same features\n",
|
|
"final_test_data = test_data[feature_cols + ['is_fraud']]\n",
|
|
"\n",
|
|
"# Save the processed test data\n",
|
|
"final_test_data.to_csv(config.PROCESSED_TEST_DATA_PATH, index=False)\n",
|
|
"print(f'Processed test data saved to {config.PROCESSED_TEST_DATA_PATH}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Summary"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"In this notebook, we performed feature engineering on the transaction data for fraud detection:\n",
|
|
"\n",
|
|
"1. **Time-Based Features**: Extracted hour, day, weekday, month, year, is_weekend, and time_of_day from the transaction timestamp.\n",
|
|
"\n",
|
|
"2. **Distance Calculation**: Calculated the distance between the cardholder and merchant locations.\n",
|
|
"\n",
|
|
"3. **Age Calculation**: Derived the age of the cardholder at the time of the transaction.\n",
|
|
"\n",
|
|
"4. **Transaction Amount Features**: Created a feature for transaction amount relative to the category average.\n",
|
|
"\n",
|
|
"5. **Categorical Features**: Identified and analyzed categorical features like category.\n",
|
|
"\n",
|
|
"6. **Feature Selection**: Selected the most relevant features for the model.\n",
|
|
"\n",
|
|
"7. **Data Saving**: Saved the processed training data and the per-category average amounts, then applied the same steps to the test data and saved it.\n",
|
|
"\n",
|
|
"These engineered features will help improve the performance of our fraud detection model by providing more meaningful information about the transactions."
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|