50e95445fb
Defined file structure and completed EDA
157 lines
5.9 KiB
Plaintext
157 lines
5.9 KiB
Plaintext
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# feature_engineering_experiments.ipynb\n",
|
|
"\n",
|
|
"# Import libraries\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from datetime import datetime\n",
|
|
"\n",
|
|
"# Load data\n",
|
|
"df = pd.read_csv('../data/raw/fraudTrain.csv')\n",
|
|
"\n",
|
|
"# Basic preprocessing\n",
|
|
"df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])\n",
|
|
"df['dob'] = pd.to_datetime(df['dob'])\n",
|
|
"\n",
|
|
"# Experiment 1: Basic Features\n",
|
|
"def create_basic_features(df):\n",
|
|
" # Time-based features\n",
|
|
" df['hour'] = df['trans_date_trans_time'].dt.hour\n",
|
|
" df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek\n",
|
|
" df['month'] = df['trans_date_trans_time'].dt.month\n",
|
|
" \n",
|
|
" # Age feature\n",
|
|
" df['dob'] = pd.to_datetime(df['dob'])\n",
|
|
" reference_date = pd.to_datetime('2020-06-21')\n",
|
|
" df['age'] = (reference_date - df['dob']).dt.days // 365\n",
|
|
" \n",
|
|
" # Distance between merchant and customer\n",
|
|
" df['distance'] = np.sqrt((df['merch_lat'] - df['lat'])**2 + (df['merch_long'] - df['long'])**2)\n",
|
|
" \n",
|
|
" # Categorical encoding\n",
|
|
" cat_cols = ['category', 'gender', 'state']\n",
|
|
" for col in cat_cols:\n",
|
|
" le = LabelEncoder()\n",
|
|
" df[col+'_encoded'] = le.fit_transform(df[col])\n",
|
|
" \n",
|
|
" return df\n",
|
|
"\n",
|
|
"# Experiment 2: Transaction Patterns\n",
|
|
"def create_transaction_patterns(df):\n",
|
|
" # Transaction frequency per customer\n",
|
|
" trans_count = df.groupby('cc_num')['trans_num'].count().reset_index()\n",
|
|
" trans_count.columns = ['cc_num', 'trans_count']\n",
|
|
" df = df.merge(trans_count, on='cc_num', how='left')\n",
|
|
" \n",
|
|
" # Average transaction amount per customer\n",
|
|
" avg_amount = df.groupby('cc_num')['amt'].mean().reset_index()\n",
|
|
" avg_amount.columns = ['cc_num', 'avg_trans_amount']\n",
|
|
" df = df.merge(avg_amount, on='cc_num', how='left')\n",
|
|
" \n",
|
|
" # Difference from average amount\n",
|
|
" df['amt_diff_from_avg'] = df['amt'] - df['avg_trans_amount']\n",
|
|
" \n",
|
|
" return df\n",
|
|
"\n",
|
|
"# Experiment 3: Time-based Features\n",
|
|
"def create_time_features(df):\n",
|
|
" # Time since last transaction\n",
|
|
" df = df.sort_values(['cc_num', 'trans_date_trans_time'])\n",
|
|
" df['time_since_last'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 60\n",
|
|
" \n",
|
|
" # Fill NA for first transactions\n",
|
|
" df['time_since_last'] = df['time_since_last'].fillna(24*60) # Assume 24 hours if first transaction\n",
|
|
" \n",
|
|
" # Transaction velocity (transactions per hour)\n",
|
|
" df['trans_velocity'] = 60 / df['time_since_last'] # transactions per hour\n",
|
|
" \n",
|
|
" return df\n",
|
|
"\n",
|
|
"# Experiment 4: Merchant Behavior\n",
|
|
"def create_merchant_features(df):\n",
|
|
" # Merchant transaction count\n",
|
|
" merchant_counts = df['merchant'].value_counts().reset_index()\n",
|
|
" merchant_counts.columns = ['merchant', 'merchant_trans_count']\n",
|
|
" df = df.merge(merchant_counts, on='merchant', how='left')\n",
|
|
" \n",
|
|
" # Merchant fraud rate\n",
|
|
" merchant_fraud = df.groupby('merchant')['is_fraud'].mean().reset_index()\n",
|
|
" merchant_fraud.columns = ['merchant', 'merchant_fraud_rate']\n",
|
|
" df = df.merge(merchant_fraud, on='merchant', how='left')\n",
|
|
" \n",
|
|
" return df\n",
|
|
"\n",
|
|
"# Apply all feature engineering steps\n",
|
|
"df_features = create_basic_features(df)\n",
|
|
"df_features = create_transaction_patterns(df_features)\n",
|
|
"df_features = create_time_features(df_features)\n",
|
|
"df_features = create_merchant_features(df_features)\n",
|
|
"\n",
|
|
"# Select final features\n",
|
|
"features = ['amt', 'hour', 'day_of_week', 'month', 'age', 'distance',\n",
|
|
" 'category_encoded', 'gender_encoded', 'state_encoded',\n",
|
|
" 'trans_count', 'avg_trans_amount', 'amt_diff_from_avg',\n",
|
|
" 'time_since_last', 'trans_velocity', 'merchant_trans_count',\n",
|
|
" 'merchant_fraud_rate', 'city_pop']\n",
|
|
"\n",
|
|
"X = df_features[features]\n",
|
|
"y = df_features['is_fraud']\n",
|
|
"\n",
|
|
"# Split data\n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)\n",
|
|
"\n",
|
|
"X_train.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
|
|
"X_test.replace([np.inf, -np.inf], np.nan, inplace=True)\n",
|
|
"X_train.dropna(inplace=True)\n",
|
|
"# Scale numerical features\n",
|
|
"scaler = StandardScaler()\n",
|
|
"X_train_scaled = scaler.fit_transform(X_train)\n",
|
|
"X_test_scaled = scaler.transform(X_test)\n",
|
|
"\n",
|
|
"# Save processed data for modeling\n",
|
|
"pd.DataFrame(X_train_scaled, columns=features).to_csv('X_train.csv', index=False)\n",
|
|
"pd.DataFrame(X_test_scaled, columns=features).to_csv('X_test.csv', index=False)\n",
|
|
"y_train.to_csv('y_train.csv', index=False)\n",
|
|
"y_test.to_csv('y_test.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}