Files
task_fraud_detection_bolade/experiments/01_EDA_fraud_detection.ipynb
T
boladeE 50e95445fb First commit
Defined file structure and completed EDA
2025-04-24 23:39:36 +01:00

160 lines
3.3 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "2c5baf8e",
"metadata": {},
"source": [
"# 📊 Exploratory Data Analysis: Fraud Detection Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f3e6a97",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Relative path: resolved against the kernel's current working directory.\n",
"df = pd.read_csv(filepath_or_buffer=\"fraudTest.csv\")\n",
"# Show the first five rows as a quick look at the columns.\n",
"df.head(n=5)"
]
},
{
"cell_type": "markdown",
"id": "2bcadae6",
"metadata": {},
"source": [
"## 🧾 Basic Overview of the Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "820cb0e9",
"metadata": {},
"outputs": [],
"source": [
"# Each (label, value) pair is printed with a two-argument print call.\n",
"overview = (\n",
"    (\"Shape:\", df.shape),\n",
"    (\"\\nData Types:\\n\", df.dtypes),\n",
"    (\"\\nMissing Values:\\n\", df.isna().sum()),  # per-column null counts\n",
"    (\"\\nDuplicate Rows:\", df.duplicated().sum()),  # rows identical to an earlier row\n",
")\n",
"for label, value in overview:\n",
"    print(label, value)"
]
},
{
"cell_type": "markdown",
"id": "caa22db9",
"metadata": {},
"source": [
"## ⚖️ Class Balance"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fb75259",
"metadata": {},
"outputs": [],
"source": [
"# Bar chart of the number of rows per is_fraud value.\n",
"ax = sns.countplot(x=\"is_fraud\", data=df)\n",
"ax.set_title(\"Fraud vs Non-Fraud Transactions\")\n",
"plt.show()\n",
"\n",
"# Mean of the label column; reads as the fraud share assuming a 0/1 encoding (TODO confirm).\n",
"fraud_ratio = df[\"is_fraud\"].mean()\n",
"print(f\"Fraudulent transactions: {fraud_ratio:.4%}\")"
]
},
{
"cell_type": "markdown",
"id": "658e9cd2",
"metadata": {},
"source": [
"## 📊 Statistical Summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "202e2612",
"metadata": {},
"outputs": [],
"source": [
"# Summary statistics for every column, numeric and non-numeric alike.\n",
"df.describe(include=\"all\")"
]
},
{
"cell_type": "markdown",
"id": "12d24a95",
"metadata": {},
"source": [
"## 🔗 Correlation Matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c02acf0",
"metadata": {},
"outputs": [],
"source": [
"# numeric_only=True restricts the pairwise correlation to numeric columns.\n",
"# NOTE(review): numeric ID-like columns, if any, will appear too; confirm they are meaningful.\n",
"corr_matrix = df.corr(numeric_only=True)\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 8))\n",
"sns.heatmap(corr_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\", ax=ax)\n",
"ax.set_title(\"Feature Correlation Matrix\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "fce8183a",
"metadata": {},
"source": [
"## 💵 Transaction Amount Distribution by Fraud"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea72b131",
"metadata": {},
"outputs": [],
"source": [
"# One box of transaction amounts per is_fraud value.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.boxplot(x=\"is_fraud\", y=\"amt\", data=df, ax=ax)\n",
"# Log-scaled y axis, presumably so large amounts do not flatten the boxes.\n",
"ax.set_yscale(\"log\")\n",
"ax.set_title(\"Transaction Amount by Fraud Status\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "a7d7d378",
"metadata": {},
"source": [
"## 🕒 Transaction Timing (Hourly)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f26f36f",
"metadata": {},
"outputs": [],
"source": [
"# Parse the timestamp column so the .dt accessor works. Re-running the cell is safe:\n",
"# converting an already-datetime column leaves it unchanged.\n",
"df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])\n",
"# Hour of day (0-23) of each transaction, stored as a new column on df.\n",
"df['hour'] = df['trans_date_trans_time'].dt.hour\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 6))\n",
"# discrete=True draws one unit-width bar centred on each integer hour. bins=24 would\n",
"# instead cut the observed min-max range into 24 slices whose edges drift off the\n",
"# hour ticks (and stop mapping one-to-one to hours if hour 0 or 23 is absent).\n",
"sns.histplot(data=df, x='hour', hue='is_fraud', multiple='stack', discrete=True, ax=ax)\n",
"ax.set_xticks(range(24))\n",
"ax.set_xlabel(\"Hour of day\")\n",
"ax.set_title(\"Transaction Hour Distribution\")\n",
"plt.show()"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}