Files
2025-07-18 22:05:55 +01:00

141 lines
3.7 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"import joblib"
],
"metadata": {
"id": "a1U4QR1AOKAK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Read the processed train and test splits (paths are relative to the working directory)\n",
"train_df, test_df = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in ('train_processed.csv', 'test_processed.csv')\n",
")"
],
"metadata": {
"id": "mOjH5CqhOlTp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Split each frame into a feature matrix (X) and the target vector (y).\n",
"# The column names live in one place so the train and test splits are\n",
"# always prepared identically and cannot drift apart.\n",
"TARGET_COLUMN = 'is_fraud'\n",
"# Columns removed from the feature matrix: the target itself plus four\n",
"# other fields that are not used as model inputs.\n",
"NON_FEATURE_COLUMNS = [TARGET_COLUMN, 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time']\n",
"\n",
"X_train = train_df.drop(columns=NON_FEATURE_COLUMNS)\n",
"y_train = train_df[TARGET_COLUMN]\n",
"\n",
"X_test = test_df.drop(columns=NON_FEATURE_COLUMNS)\n",
"y_test = test_df[TARGET_COLUMN]\n"
],
"metadata": {
"id": "VW8cpQ6FNvC9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Column groups routed through the preprocessing step\n",
"numeric_features = [\n",
"    'amt', 'city_pop', 'hour', 'day_of_week',\n",
"    'month', 'age', 'distance',\n",
"]\n",
"categorical_features = ['category', 'gender', 'job', 'merchant']\n",
"\n",
"# Numeric columns are standardised; categorical columns are one-hot encoded,\n",
"# with handle_unknown='ignore' so unseen categories do not raise at transform time\n",
"numeric_transformer = StandardScaler()\n",
"categorical_transformer = OneHotEncoder(handle_unknown='ignore')\n",
"\n",
"preprocessor = ColumnTransformer(transformers=[\n",
"    ('num', numeric_transformer, numeric_features),\n",
"    ('cat', categorical_transformer, categorical_features),\n",
"])"
],
"metadata": {
"id": "GkGMEEYqOc_p"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Chain the preprocessing step and the classifier into a single estimator\n",
"model = Pipeline([\n",
"    ('preprocessor', preprocessor),\n",
"    (\n",
"        'classifier',\n",
"        RandomForestClassifier(random_state=42, class_weight='balanced'),\n",
"    ),\n",
"])\n"
],
"metadata": {
"id": "s2VyUaV6OZw9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Fit the full pipeline (preprocessing + classifier) on the training split\n",
"model.fit(X=X_train, y=y_train)"
],
"metadata": {
"id": "6dXK4nJnOW40"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Predict on the test split and print the per-class classification report\n",
"y_pred = model.predict(X_test)\n",
"print(\"Test Set Performance:\")\n",
"print(classification_report(y_true=y_test, y_pred=y_pred))"
],
"metadata": {
"id": "PNV9fZhkOVFM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Serialise the pipeline object to disk with joblib, then confirm the output file\n",
"joblib.dump(value=model, filename='fraud_model.pkl')\n",
"print(\"Model saved to fraud_model.pkl\")"
],
"metadata": {
"id": "4j2mlzRINc_N"
},
"execution_count": null,
"outputs": []
}
]
}