2025-07-25 20:10:44 +01:00
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
|
|
|
from sklearn.metrics import classification_report
|
|
|
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
|
|
|
from sklearn.compose import ColumnTransformer
|
|
|
|
|
from sklearn.pipeline import Pipeline
|
|
|
|
|
import joblib
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_data(train_file, test_file):
    """Read the training and test CSV files and discard incomplete rows.

    Args:
        train_file: Path (or anything ``pd.read_csv`` accepts) of the
            training CSV.
        test_file: Path (or anything ``pd.read_csv`` accepts) of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames in which every row that
        contained at least one missing value has been removed. The surviving
        rows keep their original index labels.
    """
    def _read_complete_rows(csv_path):
        # Any row with a missing value in any column is discarded.
        return pd.read_csv(csv_path).dropna()

    return _read_complete_rows(train_file), _read_complete_rows(test_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def feature_engineering(df):
    """Derive time, age and distance features from a transactions frame.

    The input frame is left untouched; a copy carrying the parsed date
    columns and the new feature columns is returned.

    Args:
        df: DataFrame containing the columns ``trans_date_trans_time``,
            ``dob``, ``lat``, ``long``, ``merch_lat`` and ``merch_long``.

    Returns:
        A new DataFrame in which ``trans_date_trans_time`` and ``dob`` are
        datetimes and the columns ``hour``, ``day_of_week``, ``month``,
        ``age`` and ``distance`` have been added.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    df = df.copy()

    # Parse the transaction timestamp once and reuse it for every time feature.
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    trans_time = df['trans_date_trans_time']

    df['hour'] = trans_time.dt.hour
    df['day_of_week'] = trans_time.dt.dayofweek
    df['month'] = trans_time.dt.month

    df['dob'] = pd.to_datetime(df['dob'])
    # Age is measured at the moment of the transaction rather than against
    # the wall clock, so the feature is reproducible and does not drift with
    # the date on which the script happens to run.
    # Whole years are approximated as 365-day blocks.
    df['age'] = (trans_time - df['dob']).dt.days // 365

    # Straight-line distance in coordinate (degree) space between the
    # cardholder and merchant locations.
    # NOTE(review): this is not a geographic distance in km; confirm that a
    # degree-space proxy is what the model is meant to consume.
    lat_delta = df['lat'] - df['merch_lat']
    long_delta = df['long'] - df['merch_long']
    df['distance'] = np.sqrt(lat_delta ** 2 + long_delta ** 2)

    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train_model(train_df, test_df):
    """Fit a random-forest fraud classifier and report test-set metrics.

    Args:
        train_df: Feature-engineered training DataFrame that includes the
            ``is_fraud`` target column.
        test_df: Feature-engineered test DataFrame with the same columns.

    Returns:
        The fitted ``Pipeline`` (preprocessing followed by the classifier).

    Side effects:
        Prints a classification report for the test set to stdout.
    """
    # Target plus identifier / raw-date columns that are removed from the
    # feature frames before they are handed to the pipeline.
    excluded_columns = ['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time']

    X_train = train_df.drop(excluded_columns, axis=1)
    y_train = train_df['is_fraud']

    X_test = test_df.drop(excluded_columns, axis=1)
    y_test = test_df['is_fraud']

    # Columns routed to each transformer.
    numeric_features = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance']
    categorical_features = ['category', 'gender', 'job', 'merchant']

    column_steps = [
        ('num', StandardScaler(), numeric_features),
        # Categories unseen during fitting are ignored at transform time
        # instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)

    # Fixed seed for repeatable fits; class weights are balanced by the
    # classifier itself.
    classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])

    model.fit(X_train, y_train)

    # Evaluate on the held-out test frame.
    y_pred = model.predict(X_test)
    print("Test Set Performance:")
    print(classification_report(y_test, y_pred))

    return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the full pipeline: load data, build features, train, save model.

    Reads ``data/raw/fraudTrain.csv`` and ``data/raw/fraudTest.csv``, prints
    the number of rows left after cleaning, trains the model and writes it to
    ``models/fraud_model.pkl``.
    """
    # Function-scope import: only this block needs pathlib.
    from pathlib import Path

    model_path = 'models/fraud_model.pkl'

    # Load data
    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')

    # Feature engineering
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Print dataset sizes after cleaning
    print(f"Training samples after cleaning: {len(train_df)}")
    print(f"Test samples after cleaning: {len(test_df)}")

    # Make sure the output directory exists before the expensive training
    # step, so a missing or uncreatable directory fails early instead of
    # discarding a freshly trained model at save time.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)

    # Train model
    model = train_model(train_df, test_df)

    # Save model
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|