"""Train a random-forest fraud classifier from transaction CSVs and save it."""

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from datetime import datetime
from pathlib import Path

# Label column predicted by the classifier.
TARGET_COLUMN = 'is_fraud'

# Columns removed from the feature frames before fitting/predicting.
NON_FEATURE_COLUMNS = ['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time']

# Columns routed to StandardScaler / OneHotEncoder respectively.
NUMERIC_FEATURES = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance']
CATEGORICAL_FEATURES = ['category', 'gender', 'job', 'merchant']

# Mean Earth radius used by the haversine distance, in kilometres.
EARTH_RADIUS_KM = 6371.0


def load_data(train_file, test_file):
    """Read the training and test CSV files and drop incomplete rows.

    Args:
        train_file: Path (or buffer) of the training CSV.
        test_file: Path (or buffer) of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames in which every row
        containing at least one missing value has been removed.
    """
    train_df = pd.read_csv(train_file).dropna()
    test_df = pd.read_csv(test_file).dropna()
    return train_df, test_df


def _haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Inputs are latitudes/longitudes in degrees; scalars and pandas/NumPy
    arrays are both accepted and are combined element-wise.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    # Clip guards against tiny floating-point overshoot outside [0, 1],
    # which would make arcsin return NaN.
    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


def feature_engineering(df):
    """Derive time, age and distance features from a transactions frame.

    The input must contain the columns ``trans_date_trans_time``, ``dob``,
    ``lat``, ``long``, ``merch_lat`` and ``merch_long``.

    Args:
        df: Transactions DataFrame. It is not modified; a copy is returned.

    Returns:
        A new DataFrame with ``trans_date_trans_time`` and ``dob`` parsed to
        datetimes and the added columns ``hour``, ``day_of_week``, ``month``,
        ``age`` and ``distance``.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()

    trans_time = pd.to_datetime(df['trans_date_trans_time'])
    df['trans_date_trans_time'] = trans_time

    # Time-of-transaction features.
    df['hour'] = trans_time.dt.hour
    df['day_of_week'] = trans_time.dt.dayofweek
    df['month'] = trans_time.dt.month

    # Age in whole years *at the time of the transaction*. Measuring against
    # the wall clock would make the feature change with the day the script is
    # run, so training and later scoring would disagree.
    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = (trans_time - df['dob']).dt.days // 365

    # Great-circle distance between cardholder and merchant. A plain Euclidean
    # norm on degrees is not a real distance because one degree of longitude
    # shrinks with latitude.
    df['distance'] = _haversine_km(
        df['lat'], df['long'], df['merch_lat'], df['merch_long']
    )

    return df


def train_model(train_df, test_df):
    """Fit the fraud-detection pipeline and print its test-set report.

    Args:
        train_df: Feature-engineered training frame including ``is_fraud``.
        test_df: Feature-engineered test frame including ``is_fraud``.

    Returns:
        The fitted ``Pipeline`` (column preprocessing followed by a
        ``RandomForestClassifier``).
    """
    # Split features from the target for both datasets.
    X_train = train_df.drop(NON_FEATURE_COLUMNS, axis=1)
    y_train = train_df[TARGET_COLUMN]
    X_test = test_df.drop(NON_FEATURE_COLUMNS, axis=1)
    y_test = test_df[TARGET_COLUMN]

    # Scale numeric columns and one-hot encode categorical ones. Categories
    # that appear only at prediction time are ignored instead of raising.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERIC_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
        ]
    )

    model = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            (
                'classifier',
                RandomForestClassifier(class_weight='balanced', random_state=42),
            ),
        ]
    )

    model.fit(X_train, y_train)

    # Evaluate on the held-out test data.
    y_pred = model.predict(X_test)
    print("Test Set Performance:")
    print(classification_report(y_test, y_pred))

    return model


def main():
    """Run the pipeline end to end: load, engineer features, train, save."""
    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')

    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    print(f"Training samples after cleaning: {len(train_df)}")
    print(f"Test samples after cleaning: {len(test_df)}")

    model = train_model(train_df, test_df)

    # Make sure the output directory exists; joblib.dump does not create it.
    Path('models/fraud_model.pkl').parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, 'models/fraud_model.pkl')
    print("Model saved to models/fraud_model.pkl")


if __name__ == "__main__":
    main()