{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# feature_engineering_experiments.ipynb\n",
    "\n",
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
    "from sklearn.model_selection import train_test_split\n",
    "from datetime import datetime\n",
    "\n",
    "# Load data\n",
    "df = pd.read_csv('../data/raw/fraudTrain.csv')\n",
    "\n",
    "# Basic preprocessing\n",
    "df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])\n",
    "df['dob'] = pd.to_datetime(df['dob'])\n",
    "\n",
    "# Experiment 1: Basic Features\n",
    "def create_basic_features(df, reference_date='2020-06-21'):\n",
    "    \"\"\"Add calendar, age, distance and label-encoded categorical columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame with trans_date_trans_time, dob, lat, long, merch_lat,\n",
    "        merch_long, category, gender and state columns.\n",
    "    reference_date : date that customer age is measured against.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    A new DataFrame; the input frame is not modified.\n",
    "    \"\"\"\n",
    "    # Work on a copy so re-running the cell never stacks changes onto `df`.\n",
    "    df = df.copy()\n",
    "\n",
    "    # Time-based features\n",
    "    timestamps = df['trans_date_trans_time']\n",
    "    df['hour'] = timestamps.dt.hour\n",
    "    df['day_of_week'] = timestamps.dt.dayofweek\n",
    "    df['month'] = timestamps.dt.month\n",
    "\n",
    "    # Age feature in whole years (365-day years, so leap days are ignored)\n",
    "    df['dob'] = pd.to_datetime(df['dob'])\n",
    "    df['age'] = (pd.to_datetime(reference_date) - df['dob']).dt.days // 365\n",
    "\n",
    "    # Distance between merchant and customer.\n",
    "    # NOTE: this is a straight-line distance in degree space, not kilometres.\n",
    "    df['distance'] = np.sqrt((df['merch_lat'] - df['lat'])**2 + (df['merch_long'] - df['long'])**2)\n",
    "\n",
    "    # Categorical encoding (one integer code per distinct label)\n",
    "    for col in ['category', 'gender', 'state']:\n",
    "        df[col+'_encoded'] = LabelEncoder().fit_transform(df[col])\n",
    "\n",
    "    return df\n",
    "\n",
    "# Experiment 2: Transaction Patterns\n",
    "def create_transaction_patterns(df):\n",
    "    \"\"\"Add per-card transaction count, mean amount and deviation from that mean.\n",
    "\n",
    "    The aggregates are taken over every row of `df` and use no labels.\n",
    "    Returns a new DataFrame; the input frame is not modified.\n",
    "    \"\"\"\n",
    "    per_card = df.groupby('cc_num')\n",
    "    df = df.assign(\n",
    "        # Transaction frequency per customer\n",
    "        trans_count=per_card['trans_num'].transform('count'),\n",
    "        # Average transaction amount per customer\n",
    "        avg_trans_amount=per_card['amt'].transform('mean'),\n",
    "    )\n",
    "\n",
    "    # Difference from average amount\n",
    "    df['amt_diff_from_avg'] = df['amt'] - df['avg_trans_amount']\n",
    "\n",
    "    return df\n",
    "\n",
    "# Experiment 3: Time-based Features\n",
    "def create_time_features(df, min_gap_minutes=1 / 60):\n",
    "    \"\"\"Add minutes since the card's previous transaction and an hourly rate.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame with cc_num and trans_date_trans_time columns.\n",
    "    min_gap_minutes : floor applied to the gap before it is inverted, so\n",
    "        transactions sharing a timestamp get a large finite velocity\n",
    "        instead of a division-by-zero infinity. Defaults to one second.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    A new DataFrame sorted by card number and transaction time.\n",
    "    \"\"\"\n",
    "    # Time since last transaction\n",
    "    df = df.sort_values(['cc_num', 'trans_date_trans_time'])\n",
    "    gap_minutes = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 60\n",
    "\n",
    "    # Fill NA for first transactions\n",
    "    df['time_since_last'] = gap_minutes.fillna(24*60)  # Assume 24 hours if first transaction\n",
    "\n",
    "    # Transaction velocity (transactions per hour)\n",
    "    df['trans_velocity'] = 60 / df['time_since_last'].clip(lower=min_gap_minutes)\n",
    "\n",
    "    return df\n",
    "\n",
    "# Experiment 4: Merchant Behavior\n",
    "def create_merchant_features(df, fit_mask=None):\n",
    "    \"\"\"Add each merchant's transaction count and fraud rate.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame with merchant and is_fraud columns.\n",
    "    fit_mask : optional boolean array, one entry per row of `df`, selecting\n",
    "        the rows whose labels may be used for the fraud rate. Pass the\n",
    "        training-row mask so held-out labels never reach the features.\n",
    "        None uses every row.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    A new DataFrame; the input frame is not modified.\n",
    "    \"\"\"\n",
    "    df = df.copy()\n",
    "    if fit_mask is None:\n",
    "        fit_rows = df\n",
    "    else:\n",
    "        fit_rows = df.loc[np.asarray(fit_mask, dtype=bool)]\n",
    "\n",
    "    # Merchant transaction count (label-free, so every row is counted)\n",
    "    df['merchant_trans_count'] = df['merchant'].map(df['merchant'].value_counts())\n",
    "\n",
    "    # Merchant fraud rate; merchants absent from the fit rows fall back to\n",
    "    # the overall fraud rate of the fit rows.\n",
    "    fraud_rate = fit_rows.groupby('merchant')['is_fraud'].mean()\n",
    "    df['merchant_fraud_rate'] = df['merchant'].map(fraud_rate).fillna(fit_rows['is_fraud'].mean())\n",
    "\n",
    "    return df\n",
    "\n",
    "# Apply the label-free feature engineering steps\n",
    "df_features = create_basic_features(df)\n",
    "df_features = create_transaction_patterns(df_features)\n",
    "df_features = create_time_features(df_features)\n",
    "df_features = df_features.reset_index(drop=True)\n",
    "\n",
    "# Split data (by row position) before any label-derived feature is built\n",
    "train_rows, test_rows = train_test_split(\n",
    "    np.arange(len(df_features)), test_size=0.3, random_state=42,\n",
    "    stratify=df_features['is_fraud'])\n",
    "is_train = np.zeros(len(df_features), dtype=bool)\n",
    "is_train[train_rows] = True\n",
    "\n",
    "# Merchant fraud rate is learned from the training rows only\n",
    "df_features = create_merchant_features(df_features, fit_mask=is_train)\n",
    "\n",
    "# Select final features\n",
    "features = ['amt', 'hour', 'day_of_week', 'month', 'age', 'distance',\n",
    "            'category_encoded', 'gender_encoded', 'state_encoded',\n",
    "            'trans_count', 'avg_trans_amount', 'amt_diff_from_avg',\n",
    "            'time_since_last', 'trans_velocity', 'merchant_trans_count',\n",
    "            'merchant_fraud_rate', 'city_pop']\n",
    "\n",
    "X = df_features[features].replace([np.inf, -np.inf], np.nan)\n",
    "y = df_features['is_fraud']\n",
    "\n",
    "def drop_incomplete_rows(X_part, y_part):\n",
    "    \"\"\"Drop rows with any missing feature from X and y together.\"\"\"\n",
    "    complete = X_part.notna().all(axis=1)\n",
    "    return X_part[complete], y_part[complete]\n",
    "\n",
    "# Filter features and labels with the same mask so they stay row-aligned\n",
    "X_train, y_train = drop_incomplete_rows(X.iloc[train_rows], y.iloc[train_rows])\n",
    "X_test, y_test = drop_incomplete_rows(X.iloc[test_rows], y.iloc[test_rows])\n",
    "assert len(X_train) == len(y_train) and len(X_test) == len(y_test)\n",
    "\n",
    "# Scale numerical features\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "# Save processed data for modeling\n",
    "pd.DataFrame(X_train_scaled, columns=features).to_csv('X_train.csv', index=False)\n",
    "pd.DataFrame(X_test_scaled, columns=features).to_csv('X_test.csv', index=False)\n",
    "y_train.to_csv('y_train.csv', index=False)\n",
    "y_test.to_csv('y_test.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}