{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "\n",
 "# Twelve dummy assessments: one starts every 7 days from 2023-01-01, and the\n",
 "# end dates follow the same 7-day cadence starting at 2023-01-02.\n",
 "data_dummy = dict(\n",
 "    start_date=pd.date_range(start='2023-01-01', periods=12, freq='7D'),\n",
 "    end_date=pd.date_range(start='2023-01-02', periods=12, freq='7D'),\n",
 "    open_items=[10, 12, 11, 9, 13, 14, 15, 16, 12, 11, 10, 9],\n",
 "    red_flags=[2, 1, 3, 1, 4, 2, 1, 3, 2, 1, 4, 3],\n",
 "    num_employees=[30, 25, 28, 30, 27, 26, 31, 29, 25, 30, 27, 26],\n",
 "    assessment_type=[\n",
 "        'weekly', 'biweekly', 'quarterly', 'weekly',\n",
 "        'biweekly', 'weekly', 'quarterly', 'biweekly',\n",
 "        'weekly', 'quarterly', 'weekly', 'biweekly',\n",
 "    ],\n",
 ")\n",
 "\n",
 "# One row per assessment, columns in the order the keys are given above.\n",
 "df_dummy = pd.DataFrame(data_dummy)\n"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type
02023-01-012023-01-0210230weekly
12023-01-082023-01-0912125biweekly
22023-01-152023-01-1611328quarterly
32023-01-222023-01-239130weekly
42023-01-292023-01-3013427biweekly
52023-02-052023-02-0614226weekly
62023-02-122023-02-1315131quarterly
72023-02-192023-02-2016329biweekly
82023-02-262023-02-2712225weekly
92023-03-052023-03-0611130quarterly
102023-03-122023-03-1310427weekly
112023-03-192023-03-209326biweekly
\n", "
" ], "text/plain": [ " start_date end_date open_items red_flags num_employees assessment_type\n", "0 2023-01-01 2023-01-02 10 2 30 weekly\n", "1 2023-01-08 2023-01-09 12 1 25 biweekly\n", "2 2023-01-15 2023-01-16 11 3 28 quarterly\n", "3 2023-01-22 2023-01-23 9 1 30 weekly\n", "4 2023-01-29 2023-01-30 13 4 27 biweekly\n", "5 2023-02-05 2023-02-06 14 2 26 weekly\n", "6 2023-02-12 2023-02-13 15 1 31 quarterly\n", "7 2023-02-19 2023-02-20 16 3 29 biweekly\n", "8 2023-02-26 2023-02-27 12 2 25 weekly\n", "9 2023-03-05 2023-03-06 11 1 30 quarterly\n", "10 2023-03-12 2023-03-13 10 4 27 weekly\n", "11 2023-03-19 2023-03-20 9 3 26 biweekly" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dummy" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df_dummy.to_csv(\"test_data.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type
02023-01-012023-01-0210230weekly
12023-01-082023-01-0912125biweekly
22023-01-152023-01-1611328quarterly
32023-01-222023-01-239130weekly
42023-01-292023-01-3013427biweekly
\n", "
" ], "text/plain": [ " start_date end_date open_items red_flags num_employees assessment_type\n", "0 2023-01-01 2023-01-02 10 2 30 weekly\n", "1 2023-01-08 2023-01-09 12 1 25 biweekly\n", "2 2023-01-15 2023-01-16 11 3 28 quarterly\n", "3 2023-01-22 2023-01-23 9 1 30 weekly\n", "4 2023-01-29 2023-01-30 13 4 27 biweekly" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dummy.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type_biweeklyassessment_type_quarterlyassessment_type_weeklyopen_items_weekly_lag_1open_items_biweekly_lag_1open_items_quarterly_lag_1open_items_weekly_ma_3open_items_biweekly_ma_3open_items_quarterly_ma_3time_since_last_eventpercentage_change_open_items
22023-01-152023-01-1611328FalseTrueFalse0.00.012.00.0000000.011.07.0-8.333333
32023-01-222023-01-239130FalseFalseTrue11.00.00.010.6666670.00.07.0-18.181818
42023-01-292023-01-3013427TrueFalseFalse0.09.00.00.00000011.00.07.044.444444
\n", "
" ], "text/plain": [ " start_date end_date open_items red_flags num_employees \\\n", "2 2023-01-15 2023-01-16 11 3 28 \n", "3 2023-01-22 2023-01-23 9 1 30 \n", "4 2023-01-29 2023-01-30 13 4 27 \n", "\n", " assessment_type_biweekly assessment_type_quarterly \\\n", "2 False True \n", "3 False False \n", "4 True False \n", "\n", " assessment_type_weekly open_items_weekly_lag_1 open_items_biweekly_lag_1 \\\n", "2 False 0.0 0.0 \n", "3 True 11.0 0.0 \n", "4 False 0.0 9.0 \n", "\n", " open_items_quarterly_lag_1 open_items_weekly_ma_3 \\\n", "2 12.0 0.000000 \n", "3 0.0 10.666667 \n", "4 0.0 0.000000 \n", "\n", " open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n", "2 0.0 11.0 7.0 \n", "3 0.0 0.0 7.0 \n", "4 11.0 0.0 7.0 \n", "\n", " percentage_change_open_items \n", "2 -8.333333 \n", "3 -18.181818 \n", "4 44.444444 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# Dummy dataset holding the five most recent assessments\n",
 "data_dummy = {\n",
 "    'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
 "    'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
 "    'open_items': [10, 12, 11, 9, 13],\n",
 "    'red_flags': [2, 1, 3, 1, 4],\n",
 "    'num_employees': [30, 25, 28, 30, 27],\n",
 "    'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly'],\n",
 "}\n",
 "\n",
 "df_dummy = pd.DataFrame(data_dummy)\n",
 "\n",
 "# One-hot encode the assessment type (adds one assessment_type_<type> column per type)\n",
 "df_dummy = pd.get_dummies(df_dummy, columns=['assessment_type'], drop_first=False)\n",
 "\n",
 "assessment_types = ['weekly', 'biweekly', 'quarterly']\n",
 "\n",
 "# Lag-1 feature per type: the previous row's open_items multiplied by the\n",
 "# type indicator, so rows of any other type get 0.\n",
 "previous_open_items = df_dummy['open_items'].shift(1)\n",
 "for assessment_type in assessment_types:\n",
 "    type_flag = df_dummy[f'assessment_type_{assessment_type}']\n",
 "    df_dummy[f'open_items_{assessment_type}_lag_1'] = previous_open_items * type_flag\n",
 "\n",
 "# 3-row moving average of open_items, masked by the type indicator the same way\n",
 "rolling_mean_3 = df_dummy['open_items'].rolling(window=3).mean()\n",
 "for assessment_type in assessment_types:\n",
 "    type_flag = df_dummy[f'assessment_type_{assessment_type}']\n",
 "    df_dummy[f'open_items_{assessment_type}_ma_3'] = rolling_mean_3 * type_flag\n",
 "\n",
 "# Days between the start of this assessment and the start of the previous row\n",
 "df_dummy['start_date'] = pd.to_datetime(df_dummy['start_date'])\n",
 "df_dummy['time_since_last_event'] = df_dummy['start_date'].diff().dt.days\n",
 "\n",
 "# Percentage change of open_items relative to the previous row\n",
 "df_dummy['percentage_change_open_items'] = df_dummy['open_items'].pct_change() * 100\n",
 "\n",
 "# The shift, the rolling window and the diff leave NaNs in the leading rows;\n",
 "# drop every row that contains one.\n",
 "df_dummy = df_dummy.dropna()\n",
 "\n",
 "# Show the resulting frame with all time-based features\n",
 "df_dummy.head()\n"
 ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Create a dummy dataset with past 5 assessments\n", "data_dummy = {\n", " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n", " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n", " 'open_items': [10, 12, 11, 9, 13],\n", " 'red_flags': [2, 1, 3, 1, 4],\n", " 'num_employees': [30, 25, 28, 30, 27],\n", " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n", "}\n", "\n", "df_dummy = pd.DataFrame(data_dummy)\n", "\n", "# Save the DataFrame as a CSV file\n", "df_dummy.to_csv('dummy_assessment_data.csv', index=False)\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type_biweeklyassessment_type_quarterlyassessment_type_weeklyopen_items_assessment_type_weekly_lag_1open_items_assessment_type_biweekly_lag_1open_items_assessment_type_quarterly_lag_1open_items_weekly_ma_3open_items_biweekly_ma_3open_items_quarterly_ma_3time_since_last_eventpercentage_change_open_items
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [start_date, end_date, open_items, red_flags, num_employees, assessment_type_biweekly, assessment_type_quarterly, assessment_type_weekly, open_items_assessment_type_weekly_lag_1, open_items_assessment_type_biweekly_lag_1, open_items_assessment_type_quarterly_lag_1, open_items_weekly_ma_3, open_items_biweekly_ma_3, open_items_quarterly_ma_3, time_since_last_event, percentage_change_open_items]\n", "Index: []" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# Create a dummy dataset with past 5 assessments\n",
 "data_dummy = {\n",
 "    'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
 "    'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
 "    'open_items': [10, 12, 11, 9, 13],\n",
 "    'red_flags': [2, 1, 3, 1, 4],\n",
 "    'num_employees': [30, 25, 28, 30, 27],\n",
 "    'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
 "}\n",
 "\n",
 "df_dummy = pd.DataFrame(data_dummy)\n",
 "\n",
 "# Convert 'assessment_type' to categorical (one-hot encoding)\n",
 "df_dummy = pd.get_dummies(df_dummy, columns=['assessment_type'], drop_first=False)\n",
 "\n",
 "\n",
 "def create_lagged_features(df, col, assessment_col):\n",
 "    \"\"\"Add a lag-1 column of `col` computed only among rows flagged by `assessment_col`.\n",
 "\n",
 "    The new column is named `<col>_<assessment_col>_lag_1`. The shift is applied\n",
 "    to the flagged rows only, so each flagged row receives the value of the\n",
 "    previous flagged row. The first flagged row and every unflagged row are NaN\n",
 "    after index alignment on assignment.\n",
 "\n",
 "    Args:\n",
 "        df: DataFrame to extend (modified in place).\n",
 "        col: Name of the column to lag.\n",
 "        assessment_col: Name of the indicator column selecting the rows.\n",
 "\n",
 "    Returns:\n",
 "        The same DataFrame, with the lag column added.\n",
 "    \"\"\"\n",
 "    lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
 "    df[lagged_col] = df.loc[df[assessment_col] == 1, col].shift(1)\n",
 "    return df\n",
 "\n",
 "\n",
 "assessment_types = ['weekly', 'biweekly', 'quarterly']\n",
 "\n",
 "# Create lagged features for each assessment type\n",
 "for assessment_type in assessment_types:\n",
 "    df_dummy = create_lagged_features(df_dummy, 'open_items', f'assessment_type_{assessment_type}')\n",
 "\n",
 "# Create moving averages for each assessment type, over rows of that type only.\n",
 "# min_periods=1 is required: with window=3 alone, a type that has fewer than\n",
 "# three assessments produces nothing but NaN.\n",
 "for assessment_type in assessment_types:\n",
 "    type_rows = df_dummy[f'assessment_type_{assessment_type}'] == 1\n",
 "    df_dummy[f'open_items_{assessment_type}_ma_3'] = (\n",
 "        df_dummy.loc[type_rows, 'open_items'].rolling(window=3, min_periods=1).mean()\n",
 "    )\n",
 "\n",
 "# Add time since last event (days between assessments)\n",
 "df_dummy['start_date'] = pd.to_datetime(df_dummy['start_date'])\n",
 "df_dummy['time_since_last_event'] = df_dummy['start_date'].diff().dt.days\n",
 "\n",
 "# Add percentage change in open items\n",
 "df_dummy['percentage_change_open_items'] = df_dummy['open_items'].pct_change() * 100\n",
 "\n",
 "# Every row is NaN in the per-type columns of the other assessment types, so\n",
 "# dropna() would discard the whole frame. Fill the gaps with 0 instead.\n",
 "df_dummy = df_dummy.fillna(0)\n",
 "\n",
 "# Display the final DataFrame with all time-based features\n",
 "df_dummy.head()\n"
 ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type_biweeklyassessment_type_quarterlyassessment_type_weeklyopen_items_assessment_type_weekly_lag_1open_items_assessment_type_biweekly_lag_1open_items_assessment_type_quarterly_lag_1open_items_weekly_ma_3open_items_biweekly_ma_3open_items_quarterly_ma_3time_since_last_eventpercentage_change_open_items
02023-01-012023-01-02102300010.00.00.010.00.00.00.00.000000
12023-01-082023-01-091212510010.00.00.010.012.00.07.020.000000
22023-01-152023-01-16113280100.012.00.010.012.011.07.0-8.333333
32023-01-222023-01-2391300010.00.011.09.012.011.07.0-18.181818
42023-01-292023-01-30134271009.00.00.09.013.011.07.044.444444
\n", "
" ], "text/plain": [ " start_date end_date open_items red_flags num_employees \\\n", "0 2023-01-01 2023-01-02 10 2 30 \n", "1 2023-01-08 2023-01-09 12 1 25 \n", "2 2023-01-15 2023-01-16 11 3 28 \n", "3 2023-01-22 2023-01-23 9 1 30 \n", "4 2023-01-29 2023-01-30 13 4 27 \n", "\n", " assessment_type_biweekly assessment_type_quarterly \\\n", "0 0 0 \n", "1 1 0 \n", "2 0 1 \n", "3 0 0 \n", "4 1 0 \n", "\n", " assessment_type_weekly open_items_assessment_type_weekly_lag_1 \\\n", "0 1 0.0 \n", "1 0 10.0 \n", "2 0 0.0 \n", "3 1 0.0 \n", "4 0 9.0 \n", "\n", " open_items_assessment_type_biweekly_lag_1 \\\n", "0 0.0 \n", "1 0.0 \n", "2 12.0 \n", "3 0.0 \n", "4 0.0 \n", "\n", " open_items_assessment_type_quarterly_lag_1 open_items_weekly_ma_3 \\\n", "0 0.0 10.0 \n", "1 0.0 10.0 \n", "2 0.0 10.0 \n", "3 11.0 9.0 \n", "4 0.0 9.0 \n", "\n", " open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n", "0 0.0 0.0 0.0 \n", "1 12.0 0.0 7.0 \n", "2 12.0 11.0 7.0 \n", "3 12.0 11.0 7.0 \n", "4 13.0 11.0 7.0 \n", "\n", " percentage_change_open_items \n", "0 0.000000 \n", "1 20.000000 \n", "2 -8.333333 \n", "3 -18.181818 \n", "4 44.444444 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import pandas as pd\n",
 "import os\n",
 "\n",
 "\n",
 "class DataPreprocessor:\n",
 "    \"\"\"Derive time-based features from a raw assessment CSV and save them.\n",
 "\n",
 "    `preprocess` reads the `assessment_type`, `open_items` and `start_date`\n",
 "    columns of the loaded frame; all other columns pass through untouched\n",
 "    apart from the NaN -> 0 fill.\n",
 "\n",
 "    NOTE(review): the moving-average and percentage-change features include\n",
 "    the current row's `open_items`. If that column is later used as a\n",
 "    prediction target, confirm this is intended.\n",
 "    \"\"\"\n",
 "\n",
 "    # Assessment types that always get an indicator column, in feature order.\n",
 "    ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')\n",
 "\n",
 "    def __init__(self, input_path, output_dir):\n",
 "        \"\"\"Remember where to read the raw CSV and where to write the result.\"\"\"\n",
 "        self.input_path = input_path\n",
 "        self.output_dir = output_dir\n",
 "        self.df = None\n",
 "\n",
 "    def load_data(self):\n",
 "        \"\"\"Read the raw CSV at `input_path` into `self.df`.\"\"\"\n",
 "        self.df = pd.read_csv(self.input_path)\n",
 "\n",
 "    def preprocess(self):\n",
 "        \"\"\"Replace `self.df` with the feature-engineered frame.\n",
 "\n",
 "        Raises:\n",
 "            RuntimeError: If no data has been loaded yet.\n",
 "        \"\"\"\n",
 "        if self.df is None:\n",
 "            raise RuntimeError('load_data() must be called before preprocess()')\n",
 "\n",
 "        # Convert 'assessment_type' to categorical (one-hot encoding)\n",
 "        df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)\n",
 "\n",
 "        # get_dummies only creates a column for types present in the data, so a\n",
 "        # file without e.g. quarterly rows used to raise KeyError below. Add the\n",
 "        # missing indicators as all-zero columns and store every indicator as 0/1.\n",
 "        for assessment_type in self.ASSESSMENT_TYPES:\n",
 "            flag_col = f'assessment_type_{assessment_type}'\n",
 "            if flag_col not in df.columns:\n",
 "                df[flag_col] = 0\n",
 "            df[flag_col] = df[flag_col].astype(int)\n",
 "\n",
 "        # Lag-1 feature per type: the previous row's open_items when that\n",
 "        # previous row was of the given type, NaN otherwise.\n",
 "        for assessment_type in self.ASSESSMENT_TYPES:\n",
 "            flag_col = f'assessment_type_{assessment_type}'\n",
 "            df[f'open_items_{flag_col}_lag_1'] = df['open_items'].where(df[flag_col] == 1).shift(1)\n",
 "\n",
 "        # Fill NaNs with 0 instead of dropping rows\n",
 "        df = df.fillna(0)\n",
 "\n",
 "        # Mean of open_items over the rows of each type inside the last 3 rows\n",
 "        for assessment_type in self.ASSESSMENT_TYPES:\n",
 "            flag_col = f'assessment_type_{assessment_type}'\n",
 "            df[f'open_items_{assessment_type}_ma_3'] = (\n",
 "                df['open_items'].where(df[flag_col] == 1)\n",
 "                .rolling(window=3, min_periods=1).mean().fillna(0)\n",
 "            )\n",
 "\n",
 "        # Add time since last event (days between assessments)\n",
 "        df['start_date'] = pd.to_datetime(df['start_date'])\n",
 "        df['time_since_last_event'] = df['start_date'].diff().dt.days.fillna(0)\n",
 "\n",
 "        # Percentage change in open items. A previous value of 0 yields +/-inf,\n",
 "        # which fillna() does not catch; treat it like a missing value.\n",
 "        pct_change = df['open_items'].pct_change() * 100\n",
 "        df['percentage_change_open_items'] = pct_change.replace([float('inf'), float('-inf')], 0).fillna(0)\n",
 "\n",
 "        self.df = df\n",
 "\n",
 "    def save_data(self):\n",
 "        \"\"\"Write `self.df` to `<output_dir>/preprocessed_data.csv` and return that path.\"\"\"\n",
 "        os.makedirs(self.output_dir, exist_ok=True)  # Ensure output directory exists\n",
 "        output_path = os.path.join(self.output_dir, 'preprocessed_data.csv')\n",
 "        self.df.to_csv(output_path, index=False)\n",
 "        return output_path\n",
 "\n",
 "    def run(self):\n",
 "        \"\"\"Load, preprocess and save; returns the path of the written CSV.\"\"\"\n",
 "        self.load_data()\n",
 "        self.preprocess()\n",
 "        return self.save_data()\n",
 "\n",
 "\n",
 "# Example usage (replace the placeholder paths before running):\n",
 "# preprocessor = DataPreprocessor('path/to/input.csv', 'path/to/output/directory')\n",
 "# output_file = preprocessor.run()\n",
 "# print(f\"Preprocessed data saved to: {output_file}\")\n"
 ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type_biweeklyassessment_type_quarterlyassessment_type_weeklyopen_items_assessment_type_weekly_lag_1open_items_assessment_type_biweekly_lag_1open_items_assessment_type_quarterly_lag_1open_items_weekly_ma_3open_items_biweekly_ma_3open_items_quarterly_ma_3time_since_last_eventpercentage_change_open_items
22023-01-152023-01-1611328FalseTrueFalse0.012.00.010.012.011.07.0-8.333333
32023-01-222023-01-239130FalseFalseTrue0.00.011.09.012.011.07.0-18.181818
42023-01-292023-01-3013427TrueFalseFalse9.00.00.09.013.011.07.044.444444
\n", "
" ], "text/plain": [ " start_date end_date open_items red_flags num_employees \\\n", "2 2023-01-15 2023-01-16 11 3 28 \n", "3 2023-01-22 2023-01-23 9 1 30 \n", "4 2023-01-29 2023-01-30 13 4 27 \n", "\n", " assessment_type_biweekly assessment_type_quarterly \\\n", "2 False True \n", "3 False False \n", "4 True False \n", "\n", " assessment_type_weekly open_items_assessment_type_weekly_lag_1 \\\n", "2 False 0.0 \n", "3 True 0.0 \n", "4 False 9.0 \n", "\n", " open_items_assessment_type_biweekly_lag_1 \\\n", "2 12.0 \n", "3 0.0 \n", "4 0.0 \n", "\n", " open_items_assessment_type_quarterly_lag_1 open_items_weekly_ma_3 \\\n", "2 0.0 10.0 \n", "3 11.0 9.0 \n", "4 0.0 9.0 \n", "\n", " open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n", "2 12.0 11.0 7.0 \n", "3 12.0 11.0 7.0 \n", "4 13.0 11.0 7.0 \n", "\n", " percentage_change_open_items \n", "2 -8.333333 \n", "3 -18.181818 \n", "4 44.444444 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type
02023-01-012023-01-0210230weekly
12023-01-082023-01-0912125biweekly
22023-01-152023-01-1611328quarterly
32023-01-222023-01-239130weekly
42023-01-292023-01-3013427biweekly
\n", "
" ], "text/plain": [ " start_date end_date open_items red_flags num_employees assessment_type\n", "0 2023-01-01 2023-01-02 10 2 30 weekly\n", "1 2023-01-08 2023-01-09 12 1 25 biweekly\n", "2 2023-01-15 2023-01-16 11 3 28 quarterly\n", "3 2023-01-22 2023-01-23 9 1 30 weekly\n", "4 2023-01-29 2023-01-30 13 4 27 biweekly" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dummy.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Create a dummy dataset with past 5 assessments\n", "data_dummy = {\n", " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n", " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n", " 'open_items': [10, 12, 11, 9, 13],\n", " 'red_flags': [2, 1, 3, 1, 4],\n", " 'num_employees': [30, 25, 28, 30, 27],\n", " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n", "}\n", "\n", "df = pd.DataFrame(data_dummy)\n", "\n", "# Convert 'assessment_type' to categorical (one-hot encoding)\n", "df = pd.get_dummies(df, columns=['assessment_type'], drop_first=False)\n", "\n", "# Convert boolean columns to 1s and 0s\n", "df['assessment_type_weekly'] = df['assessment_type_weekly'].astype(int)\n", "df['assessment_type_biweekly'] = df['assessment_type_biweekly'].astype(int)\n", "df['assessment_type_quarterly'] = df['assessment_type_quarterly'].astype(int)\n", "\n", "# Function to create lagged features based on assessment type\n", "def create_lagged_features(df, col, assessment_col):\n", " lagged_col = f\"{col}_{assessment_col}_lag_1\"\n", " df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)\n", " return df\n", "\n", "# Create lagged features for each assessment type\n", "df = create_lagged_features(df, 'open_items', 'assessment_type_weekly')\n", "df = create_lagged_features(df, 'open_items', 'assessment_type_biweekly')\n", "df = create_lagged_features(df, 
'open_items', 'assessment_type_quarterly')\n", "\n", "# Fill NaNs with 0 instead of dropping rows\n", "df.fillna(0, inplace=True)\n", "\n", "# Create moving averages for each assessment type\n", "df['open_items_weekly_ma_3'] = df['open_items'].where(df['assessment_type_weekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n", "df['open_items_biweekly_ma_3'] = df['open_items'].where(df['assessment_type_biweekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n", "df['open_items_quarterly_ma_3'] = df['open_items'].where(df['assessment_type_quarterly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n", "\n", "# Add time since last event (days between assessments)\n", "df['start_date'] = pd.to_datetime(df['start_date'])\n", "df['time_since_last_event'] = df['start_date'].diff().dt.days.fillna(0)\n", "\n", "# Add percentage change in open items\n", "df['percentage_change_open_items'] = df['open_items'].pct_change().fillna(0) * 100\n", "\n", "# Display the final DataFrame with all time-based features\n", "df.head()\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "import os\n",
 "\n",
 "\n",
 "class DataPreprocessor:\n",
 "    \"\"\"Turn a raw assessment CSV into a purely numeric training table.\n",
 "\n",
 "    `preprocess` reads the `start_date`, `end_date`, `assessment_type` and\n",
 "    `open_items` columns of the loaded frame. The result is written to\n",
 "    `data/processed/assessment_prediction/<company_id>/output.csv`.\n",
 "\n",
 "    NOTE(review): the moving-average and percentage-change features include\n",
 "    the current row's `open_items`, which the ModelTrainer cell of this\n",
 "    notebook uses as a target. Confirm this is intended.\n",
 "    \"\"\"\n",
 "\n",
 "    # Assessment types that always get an indicator column, in feature order.\n",
 "    ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')\n",
 "\n",
 "    def __init__(self, input_path, company_id):\n",
 "        \"\"\"Remember the raw CSV path and derive the per-company output directory.\"\"\"\n",
 "        self.input_path = input_path\n",
 "        self.output_dir = os.path.join('data', 'processed', 'assessment_prediction', company_id)\n",
 "        self.company_id = company_id\n",
 "        self.df = None\n",
 "\n",
 "    def load_data(self):\n",
 "        \"\"\"Read the raw CSV at `input_path` into `self.df`.\"\"\"\n",
 "        self.df = pd.read_csv(self.input_path)\n",
 "\n",
 "    def preprocess(self):\n",
 "        \"\"\"Replace `self.df` with the feature-engineered frame.\n",
 "\n",
 "        Raises:\n",
 "            RuntimeError: If no data has been loaded yet.\n",
 "        \"\"\"\n",
 "        if self.df is None:\n",
 "            raise RuntimeError('load_data() must be called before preprocess()')\n",
 "\n",
 "        # Duration (in days) of each assessment. The date columns themselves\n",
 "        # are dropped as they are not needed for training.\n",
 "        start_date = pd.to_datetime(self.df['start_date'])\n",
 "        end_date = pd.to_datetime(self.df['end_date'])\n",
 "        df = self.df.drop(columns=['start_date', 'end_date'])\n",
 "        df['duration'] = (end_date - start_date).dt.days\n",
 "\n",
 "        # Convert 'assessment_type' to categorical (one-hot encoding)\n",
 "        df = pd.get_dummies(df, columns=['assessment_type'], drop_first=False)\n",
 "\n",
 "        # get_dummies only creates a column for types present in the data, so a\n",
 "        # file without e.g. quarterly rows used to raise KeyError below. Add the\n",
 "        # missing indicators as all-zero columns and store every indicator as 0/1.\n",
 "        for assessment_type in self.ASSESSMENT_TYPES:\n",
 "            flag_col = f'assessment_type_{assessment_type}'\n",
 "            if flag_col not in df.columns:\n",
 "                df[flag_col] = 0\n",
 "            df[flag_col] = df[flag_col].astype(int)\n",
 "\n",
 "        # Lag-1 feature per type: the previous row's open_items when that\n",
 "        # previous row was of the given type, NaN otherwise.\n",
 "        for assessment_type in self.ASSESSMENT_TYPES:\n",
 "            flag_col = f'assessment_type_{assessment_type}'\n",
 "            df[f'open_items_{flag_col}_lag_1'] = df['open_items'].where(df[flag_col] == 1).shift(1)\n",
 "\n",
 "        # Fill NaNs with 0 instead of dropping rows\n",
 "        df = df.fillna(0)\n",
 "\n",
 "        # Mean of open_items over the rows of each type inside the last 3 rows\n",
 "        for assessment_type in self.ASSESSMENT_TYPES:\n",
 "            flag_col = f'assessment_type_{assessment_type}'\n",
 "            df[f'open_items_{assessment_type}_ma_3'] = (\n",
 "                df['open_items'].where(df[flag_col] == 1)\n",
 "                .rolling(window=3, min_periods=1).mean().fillna(0)\n",
 "            )\n",
 "\n",
 "        # Percentage change in open items. A previous value of 0 yields +/-inf,\n",
 "        # which fillna() does not catch; treat it like a missing value.\n",
 "        pct_change = df['open_items'].pct_change() * 100\n",
 "        df['percentage_change_open_items'] = pct_change.replace([float('inf'), float('-inf')], 0).fillna(0)\n",
 "\n",
 "        self.df = df\n",
 "\n",
 "    def save_data(self):\n",
 "        \"\"\"Write `self.df` to `<output_dir>/output.csv` and return that path.\"\"\"\n",
 "        os.makedirs(self.output_dir, exist_ok=True)  # Ensure output directory exists\n",
 "        output_path = os.path.join(self.output_dir, 'output.csv')\n",
 "        self.df.to_csv(output_path, index=False)\n",
 "        return output_path\n",
 "\n",
 "    def run(self):\n",
 "        \"\"\"Load, preprocess and save; returns the path of the written CSV.\"\"\"\n",
 "        self.load_data()\n",
 "        self.preprocess()\n",
 "        return self.save_data()\n",
 "\n",
 "# Example usage:\n",
 "# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123')\n",
 "# processed_data_path = preprocessor.run()\n"
 ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "dp = DataPreprocessor(\n", " input_path=\"/root/ds_erp_ai/data/raw/dummy_assessment_data.csv\",\n", " company_id=\"testid\"\n", ")\n", "\n", "\n", "res = dp.run()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'data/processed/assessment_prediction/testid/output.csv'" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model saved to models/assessment_prediction/testid/testid_model.pkl\n", "Latest assessment data saved to models/assessment_prediction/testid/testid_latest_data.csv\n", "Model Evaluation Metrics:\n", "Mean Absolute Error (MAE): 1.3099999999999996\n", "Mean Squared Error (MSE): 2.3089999999999997\n", "R-squared (R²): nan\n", "The model was saved at: models/assessment_prediction/testid/testid_model.pkl\n", "The latest data was saved at: models/assessment_prediction/testid/testid_latest_data.csv\n", "Evaluation Results: {'mae': 1.3099999999999996, 'mse': 2.3089999999999997, 'r2': nan}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/root/ds_erp_ai/erp/lib/python3.10/site-packages/sklearn/metrics/_regression.py:1211: UndefinedMetricWarning: R^2 score is not well-defined with less than two 
samples.\n", " warnings.warn(msg, UndefinedMetricWarning)\n" ] } ], "source": [
 "import pandas as pd\n",
 "import os\n",
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.ensemble import RandomForestRegressor\n",
 "from sklearn.multioutput import MultiOutputRegressor\n",
 "import joblib\n",
 "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
 "\n",
 "\n",
 "class ModelTrainer:\n",
 "    \"\"\"Train a multi-target regressor on preprocessed assessment data, save it and evaluate it.\n",
 "\n",
 "    The targets are the `open_items` and `red_flags` columns; every other\n",
 "    column of the preprocessed CSV is used as a feature. Artifacts are written\n",
 "    to `models/assessment_prediction/<company_id>/`.\n",
 "    \"\"\"\n",
 "\n",
 "    # Multi-target: open items and red flags are predicted together.\n",
 "    TARGET_COLUMNS = ['open_items', 'red_flags']\n",
 "\n",
 "    def __init__(self, preprocessed_data_path, company_id, model, test_size=0.1, random_state=42):\n",
 "        \"\"\"\n",
 "        Args:\n",
 "            preprocessed_data_path: CSV file read by `load_data`.\n",
 "            company_id: Used for the output sub-directory and the file-name prefix.\n",
 "            model: Estimator exposing `fit` and `predict`; it is fitted in place.\n",
 "            test_size: Forwarded to `train_test_split` (default 0.1, the previous fixed value).\n",
 "            random_state: Seed forwarded to `train_test_split` (default 42, the previous fixed value).\n",
 "        \"\"\"\n",
 "        self.preprocessed_data_path = preprocessed_data_path\n",
 "        self.output_dir = os.path.join('models', 'assessment_prediction', company_id)\n",
 "        self.company_id = company_id\n",
 "        self.df = None\n",
 "        self.model = model  # Model passed as an argument\n",
 "        self.test_size = test_size\n",
 "        self.random_state = random_state\n",
 "        self.X_test = None\n",
 "        self.y_test = None\n",
 "\n",
 "    def load_data(self):\n",
 "        \"\"\"Read the preprocessed CSV into `self.df`.\"\"\"\n",
 "        self.df = pd.read_csv(self.preprocessed_data_path)\n",
 "\n",
 "    def train_model(self):\n",
 "        \"\"\"Fit the model on the training split and persist it with the latest data row.\n",
 "\n",
 "        Returns:\n",
 "            Tuple `(model_path, latest_data_path)`.\n",
 "\n",
 "        Raises:\n",
 "            RuntimeError: If no data has been loaded yet.\n",
 "        \"\"\"\n",
 "        if self.df is None:\n",
 "            raise RuntimeError('load_data() must be called before train_model()')\n",
 "\n",
 "        # Split data into features (X) and target variables (y)\n",
 "        X = self.df.drop(columns=self.TARGET_COLUMNS)\n",
 "        y = self.df[self.TARGET_COLUMNS]\n",
 "\n",
 "        # Hold out a test split; it is kept on the instance for evaluate_model()\n",
 "        X_train, self.X_test, y_train, self.y_test = train_test_split(\n",
 "            X, y, test_size=self.test_size, random_state=self.random_state\n",
 "        )\n",
 "\n",
 "        # Train the model\n",
 "        self.model.fit(X_train, y_train)\n",
 "\n",
 "        # Save the trained model\n",
 "        os.makedirs(self.output_dir, exist_ok=True)\n",
 "        model_path = os.path.join(self.output_dir, f'{self.company_id}_model.pkl')\n",
 "        joblib.dump(self.model, model_path)\n",
 "        print(f\"Model saved to {model_path}\")\n",
 "\n",
 "        # Save the latest row (last assessment data) for inference\n",
 "        latest_data_path = os.path.join(self.output_dir, f'{self.company_id}_latest_data.csv')\n",
 "        self.df.tail(1).to_csv(latest_data_path, index=False)\n",
 "        print(f\"Latest assessment data saved to {latest_data_path}\")\n",
 "\n",
 "        # Return the model path and latest data path\n",
 "        return model_path, latest_data_path\n",
 "\n",
 "    def evaluate_model(self):\n",
 "        \"\"\"Score the fitted model on the held-out split.\n",
 "\n",
 "        Returns:\n",
 "            Dict with keys `mae`, `mse` and `r2`. `r2` is NaN when the test\n",
 "            split holds fewer than two rows, because R^2 is undefined there.\n",
 "\n",
 "        Raises:\n",
 "            RuntimeError: If `train_model` has not produced a test split yet.\n",
 "        \"\"\"\n",
 "        if self.X_test is None or self.y_test is None:\n",
 "            raise RuntimeError('train_model() must be called before evaluate_model()')\n",
 "\n",
 "        # Predict using the test data\n",
 "        y_pred = self.model.predict(self.X_test)\n",
 "\n",
 "        # Calculate evaluation metrics\n",
 "        mae = mean_absolute_error(self.y_test, y_pred)\n",
 "        mse = mean_squared_error(self.y_test, y_pred)\n",
 "        if len(self.y_test) < 2:\n",
 "            # r2_score would only emit UndefinedMetricWarning and return NaN here;\n",
 "            # report the same value without triggering the warning.\n",
 "            r2 = float('nan')\n",
 "        else:\n",
 "            r2 = r2_score(self.y_test, y_pred)\n",
 "\n",
 "        print(\"Model Evaluation Metrics:\")\n",
 "        print(f\"Mean Absolute Error (MAE): {mae}\")\n",
 "        print(f\"Mean Squared Error (MSE): {mse}\")\n",
 "        print(f\"R-squared (R²): {r2}\")\n",
 "\n",
 "        # Return evaluation results\n",
 "        return {'mae': mae, 'mse': mse, 'r2': r2}\n",
 "\n",
 "    def run(self):\n",
 "        \"\"\"Load, train, save and evaluate; returns `(model_path, latest_data_path, evaluation_results)`.\"\"\"\n",
 "        # Load data and train the model\n",
 "        self.load_data()\n",
 "        model_path, latest_data_path = self.train_model()\n",
 "\n",
 "        # Evaluate the model immediately after training\n",
 "        evaluation_results = self.evaluate_model()\n",
 "\n",
 "        return model_path, latest_data_path, evaluation_results\n",
 "\n",
 "# Example usage\n",
 "model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))\n",
 "trainer = ModelTrainer(preprocessed_data_path=res, company_id='testid', model=model)\n",
 "model_path, latest_data_path, evaluation_results = trainer.run()\n",
 "print(f\"The model was saved at: {model_path}\")\n",
 "print(f\"The latest data was saved at: {latest_data_path}\")\n",
 "print(f\"Evaluation Results: {evaluation_results}\")\n"
 ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'data/processed/assessment_prediction/testid/output.csv'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'models/assessment_prediction/testid/testid_model.pkl'" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_path" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model loaded from models/assessment_prediction/testid/testid_model.pkl\n", "Latest data loaded from models/assessment_prediction/testid/testid_latest_data.csv\n", "\n", "Forecasting assessment 1/5\n", "\n", "Forecasting assessment 2/5\n", "\n", "Forecasting assessment 3/5\n", "\n", "Forecasting assessment 4/5\n", "\n", "Forecasting assessment 5/5\n", "[{'forecast_step': 1, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 2, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 3, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 4, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 5, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}]\n" ] } ], "source": [ "import pandas as pd\n", "import joblib\n", "import os\n", "\n", "class AssessmentInference:\n", " def __init__(self, company_id, num_assessments, model_dir='models'):\n", " self.company_id = company_id\n", " self.num_assessments = num_assessments\n", " self.model_dir 
= model_dir\n", " self.model = None\n", " self.latest_data = None\n", "\n", " def load_model(self):\n", " # Load the trained model\n", " model_path = os.path.join(self.model_dir, 'assessment_prediction', self.company_id, f'{self.company_id}_model.pkl')\n", " self.model = joblib.load(model_path)\n", " print(f\"Model loaded from {model_path}\")\n", "\n", " def load_latest_data(self):\n", " # Load the latest assessment data\n", " latest_data_path = os.path.join(self.model_dir, 'assessment_prediction', self.company_id, f'{self.company_id}_latest_data.csv')\n", " self.latest_data = pd.read_csv(latest_data_path)\n", " print(f\"Latest data loaded from {latest_data_path}\")\n", "\n", " def predict_next_assessment(self, current_data, assessment_type):\n", " # Update assessment type (weekly, biweekly, quarterly) in the data for prediction\n", " current_data['assessment_type_weekly'] = 1 if assessment_type == 'weekly' else 0\n", " current_data['assessment_type_biweekly'] = 1 if assessment_type == 'biweekly' else 0\n", " current_data['assessment_type_quarterly'] = 1 if assessment_type == 'quarterly' else 0\n", "\n", " # Exclude target variables (open_items, red_flags) from the feature set\n", " features = current_data.drop(columns=['open_items', 'red_flags'])\n", "\n", " # Predict the next open items and red flags\n", " prediction = self.model.predict(features)\n", " open_items_pred, red_flags_pred = prediction[0]\n", "\n", " # Ensure the predictions are integers by rounding\n", " open_items_pred = int(round(open_items_pred))\n", " red_flags_pred = int(round(red_flags_pred))\n", "\n", " return {\n", " 'assessment_type': assessment_type,\n", " 'open_items': open_items_pred,\n", " 'red_flags': red_flags_pred\n", " }\n", "\n", " def predict_next_assessments(self):\n", " predictions = []\n", " current_data = self.latest_data.copy()\n", "\n", " # Iteratively forecast the next assessments\n", " for i in range(self.num_assessments):\n", " print(f\"\\nForecasting assessment {i + 
1}/{self.num_assessments}\")\n", "\n", " # Predict for weekly, biweekly, and quarterly for the same forecast step\n", " weekly_prediction = self.predict_next_assessment(current_data, 'weekly')\n", " biweekly_prediction = self.predict_next_assessment(current_data, 'biweekly')\n", " quarterly_prediction = self.predict_next_assessment(current_data, 'quarterly')\n", "\n", " # Append predictions for all types in one forecast step\n", " predictions.append({\n", " 'forecast_step': i + 1,\n", " 'weekly': weekly_prediction,\n", " 'biweekly': biweekly_prediction,\n", " 'quarterly': quarterly_prediction\n", " })\n", "\n", " # Update the current data with the weekly prediction (or any of the predictions) for the next step\n", " current_data['open_items'] = weekly_prediction['open_items']\n", " current_data['red_flags'] = weekly_prediction['red_flags']\n", "\n", " return predictions\n", "\n", " def run(self):\n", " self.load_model()\n", " self.load_latest_data()\n", " predictions = self.predict_next_assessments()\n", " return predictions\n", "\n", "\n", "# Example usage\n", "inference = AssessmentInference(company_id='testid', num_assessments=5)\n", "predictions = inference.run()\n", "print(predictions)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dummy assessment data has been saved as dummy_company_asseement_data.csv.\n" ] } ], "source": [ "import pandas as pd\n", "\n", "# Create dummy assessment data\n", "data = {\n", " 'Assessment_ID': range(1, 11),\n", " 'Open_Items': [3, 4, 2, 5, 1, 3, 2, 4, 5, 3],\n", " 'Red_Flags': [1, 2, 0, 1, 0, 3, 2, 1, 1, 2],\n", " 'Assessment_Frequency': ['Weekly', 'Bi-Weekly', 'Weekly', 'Quarterly', 'Bi-Weekly', 'Weekly', 'Quarterly', 'Bi-Weekly', 'Weekly', 'Quarterly'],\n", " 'Assessment_Start_Date': pd.date_range(start='2023-01-01', periods=10, freq='15D'),\n", " 'Assessment_End_Date': 
pd.date_range(start='2023-01-07', periods=10, freq='15D'),\n", " 'Assessment_Area': ['Deployment', 'Communication', 'Deployment', 'Communication', 'Deployment', 'Deployment', 'Communication', 'Deployment', 'Communication', 'Deployment'],\n", " 'Assessment_Status': ['Completed', 'Completed', 'Completed', 'In Progress', 'Completed', 'Completed', 'Incomplete', 'Completed', 'In Progress', 'Completed'],\n", " 'Assessment_Admin': ['Admin A', 'Admin B', 'Admin A', 'Admin B', 'Admin A', 'Admin A', 'Admin B', 'Admin A', 'Admin B', 'Admin A']\n", "}\n", "\n", "# Create DataFrame\n", "df = pd.DataFrame(data)\n", "\n", "# Save DataFrame to CSV\n", "csv_file_path = 'dummy_company_asseement_data.csv'\n", "df.to_csv(csv_file_path, index=False)\n", "\n", "print(f\"Dummy assessment data has been saved as {csv_file_path}.\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "erp", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }