From cd8f499f977dc371da5860c2dc94e3052b6e0dcb Mon Sep 17 00:00:00 2001
From: kowshik <kowshik@mkd.com>
Date: Sat, 14 Sep 2024 01:50:41 +0000
Subject: [PATCH] added bot prediction for assessments

---
 .../testid_raw_data.2csv                      |  11 +
 .../testid_raw_data.csv                       |  11 +
 notebooks/dummy_assessment_data.csv           |  11 +
 notebooks/test_prediction_pipeline.ipynb      | 239 +++++++++++++++++-
 run.py                                        |   2 +-
 scripts/assessment_data.py                    |  70 +++++
 setup.py                                      |   0
 src/api/routes/chatbot.py                     |  62 +++++
 src/models/bot_response_schema.py             |  19 +-
 src/pipeline/data_preprocessor.py             |   5 -
 src/pipeline/model_trainer.py                 |   3 +-
 src/prompts/chatbot.py                        | 133 +++++++++-
 src/services/chatbot.py                       | 121 ++++++++-
 test.py                                       |  33 ++-
 14 files changed, 698 insertions(+), 22 deletions(-)
 create mode 100644 data/raw/erp_company_assessment/testid_raw_data.2csv
 create mode 100644 data/raw/erp_company_assessment/testid_raw_data.csv
 create mode 100644 notebooks/dummy_assessment_data.csv
 create mode 100644 scripts/assessment_data.py
 delete mode 100644 setup.py

diff --git a/data/raw/erp_company_assessment/testid_raw_data.2csv b/data/raw/erp_company_assessment/testid_raw_data.2csv
new file mode 100644
index 0000000..1cba78e
--- /dev/null
+++ b/data/raw/erp_company_assessment/testid_raw_data.2csv
@@ -0,0 +1,11 @@
+Assessment_ID,Open_Items,Red_Flags,Assessment_Frequency,Assessment_Start_Date,Assessment_End_Date,Assessment_Area,Assessment_Status,Assessment_Admin
+1,3,1,Weekly,2023-01-01,2023-01-07,Deployment,Completed,Admin A
+2,4,2,Bi-Weekly,2023-01-16,2023-01-22,Communication,Completed,Admin B
+3,2,0,Weekly,2023-01-31,2023-02-06,Deployment,Completed,Admin A
+4,5,1,Quarterly,2023-02-15,2023-02-21,Communication,In Progress,Admin B
+5,1,0,Bi-Weekly,2023-03-02,2023-03-08,Deployment,Completed,Admin A
+6,3,3,Weekly,2023-03-17,2023-03-23,Deployment,Completed,Admin A
+7,2,2,Quarterly,2023-04-01,2023-04-07,Communication,Incomplete,Admin B
+8,4,1,Bi-Weekly,2023-04-16,2023-04-22,Deployment,Completed,Admin A
+9,5,1,Weekly,2023-05-01,2023-05-07,Communication,In Progress,Admin B
+10,3,2,Quarterly,2023-05-16,2023-05-22,Deployment,Completed,Admin A
diff --git a/data/raw/erp_company_assessment/testid_raw_data.csv b/data/raw/erp_company_assessment/testid_raw_data.csv
new file mode 100644
index 0000000..77da1b3
--- /dev/null
+++ b/data/raw/erp_company_assessment/testid_raw_data.csv
@@ -0,0 +1,11 @@
+Assessment_ID,Open_Items,Red_Flags,Assessment_Frequency,Assessment_Start_Date,Assessment_End_Date,Assessment_Area,Assessment_Status,Assessment_Admin,Department
+1,3,1,Weekly,2023-01-01,2023-01-07,Deployment,Completed,Admin A,IT
+2,4,2,Bi-Weekly,2023-01-16,2023-01-22,Communication,Completed,Admin B,HR
+3,2,0,Weekly,2023-01-31,2023-02-06,Deployment,Completed,Admin A,Finance
+4,5,1,Quarterly,2023-02-15,2023-02-21,Communication,In Progress,Admin B,IT
+5,1,0,Bi-Weekly,2023-03-02,2023-03-08,Deployment,Completed,Admin A,HR
+6,3,3,Weekly,2023-03-17,2023-03-23,Deployment,Completed,Admin A,Finance
+7,2,2,Quarterly,2023-04-01,2023-04-07,Communication,Incomplete,Admin B,IT
+8,4,1,Bi-Weekly,2023-04-16,2023-04-22,Deployment,Completed,Admin A,HR
+9,5,1,Weekly,2023-05-01,2023-05-07,Communication,In Progress,Admin B,Finance
+10,3,2,Quarterly,2023-05-16,2023-05-22,Deployment,Completed,Admin A,IT
diff --git a/notebooks/dummy_assessment_data.csv b/notebooks/dummy_assessment_data.csv
new file mode 100644
index 0000000..1cba78e
--- /dev/null
+++ b/notebooks/dummy_assessment_data.csv
@@ -0,0 +1,11 @@
+Assessment_ID,Open_Items,Red_Flags,Assessment_Frequency,Assessment_Start_Date,Assessment_End_Date,Assessment_Area,Assessment_Status,Assessment_Admin
+1,3,1,Weekly,2023-01-01,2023-01-07,Deployment,Completed,Admin A
+2,4,2,Bi-Weekly,2023-01-16,2023-01-22,Communication,Completed,Admin B
+3,2,0,Weekly,2023-01-31,2023-02-06,Deployment,Completed,Admin A
+4,5,1,Quarterly,2023-02-15,2023-02-21,Communication,In Progress,Admin B
+5,1,0,Bi-Weekly,2023-03-02,2023-03-08,Deployment,Completed,Admin A
+6,3,3,Weekly,2023-03-17,2023-03-23,Deployment,Completed,Admin A
+7,2,2,Quarterly,2023-04-01,2023-04-07,Communication,Incomplete,Admin B
+8,4,1,Bi-Weekly,2023-04-16,2023-04-22,Deployment,Completed,Admin A
+9,5,1,Weekly,2023-05-01,2023-05-07,Communication,In Progress,Admin B
+10,3,2,Quarterly,2023-05-16,2023-05-22,Deployment,Completed,Admin A
diff --git a/notebooks/test_prediction_pipeline.ipynb b/notebooks/test_prediction_pipeline.ipynb
index 50a7fab..01c4c39 100644
--- a/notebooks/test_prediction_pipeline.ipynb
+++ b/notebooks/test_prediction_pipeline.ipynb
@@ -2,22 +2,204 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Create a dummy dataset with past 5 assessments\n",
     "import pandas as pd\n",
+    "\n",
     "data_dummy = {\n",
-    "    'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
-    "    'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
-    "    'open_items': [10, 12, 11, 9, 13],\n",
-    "    'red_flags': [2, 1, 3, 1, 4],\n",
-    "    'num_employees': [30, 25, 28, 30, 27],\n",
-    "    'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
+    "    'start_date': pd.date_range(start='2023-01-01', periods=12, freq='7D'),\n",
+    "    'end_date': pd.date_range(start='2023-01-02', periods=12, freq='7D'),\n",
+    "    'open_items': [10, 12, 11, 9, 13, 14, 15, 16, 12, 11, 10, 9],\n",
+    "    'red_flags': [2, 1, 3, 1, 4, 2, 1, 3, 2, 1, 4, 3],\n",
+    "    'num_employees': [30, 25, 28, 30, 27, 26, 31, 29, 25, 30, 27, 26],\n",
+    "    'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly', \n",
+    "                        'weekly', 'quarterly', 'biweekly', 'weekly', 'quarterly', 'weekly', 'biweekly']\n",
     "}\n",
     "\n",
-    "df_dummy = pd.DataFrame(data_dummy)"
+    "df_dummy = pd.DataFrame(data_dummy)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>start_date</th>\n",
+       "      <th>end_date</th>\n",
+       "      <th>open_items</th>\n",
+       "      <th>red_flags</th>\n",
+       "      <th>num_employees</th>\n",
+       "      <th>assessment_type</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>2023-01-02</td>\n",
+       "      <td>10</td>\n",
+       "      <td>2</td>\n",
+       "      <td>30</td>\n",
+       "      <td>weekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2023-01-08</td>\n",
+       "      <td>2023-01-09</td>\n",
+       "      <td>12</td>\n",
+       "      <td>1</td>\n",
+       "      <td>25</td>\n",
+       "      <td>biweekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2023-01-15</td>\n",
+       "      <td>2023-01-16</td>\n",
+       "      <td>11</td>\n",
+       "      <td>3</td>\n",
+       "      <td>28</td>\n",
+       "      <td>quarterly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2023-01-22</td>\n",
+       "      <td>2023-01-23</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1</td>\n",
+       "      <td>30</td>\n",
+       "      <td>weekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2023-01-29</td>\n",
+       "      <td>2023-01-30</td>\n",
+       "      <td>13</td>\n",
+       "      <td>4</td>\n",
+       "      <td>27</td>\n",
+       "      <td>biweekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>2023-02-05</td>\n",
+       "      <td>2023-02-06</td>\n",
+       "      <td>14</td>\n",
+       "      <td>2</td>\n",
+       "      <td>26</td>\n",
+       "      <td>weekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>2023-02-12</td>\n",
+       "      <td>2023-02-13</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1</td>\n",
+       "      <td>31</td>\n",
+       "      <td>quarterly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>2023-02-19</td>\n",
+       "      <td>2023-02-20</td>\n",
+       "      <td>16</td>\n",
+       "      <td>3</td>\n",
+       "      <td>29</td>\n",
+       "      <td>biweekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>2023-02-26</td>\n",
+       "      <td>2023-02-27</td>\n",
+       "      <td>12</td>\n",
+       "      <td>2</td>\n",
+       "      <td>25</td>\n",
+       "      <td>weekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>2023-03-05</td>\n",
+       "      <td>2023-03-06</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1</td>\n",
+       "      <td>30</td>\n",
+       "      <td>quarterly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>2023-03-12</td>\n",
+       "      <td>2023-03-13</td>\n",
+       "      <td>10</td>\n",
+       "      <td>4</td>\n",
+       "      <td>27</td>\n",
+       "      <td>weekly</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>2023-03-19</td>\n",
+       "      <td>2023-03-20</td>\n",
+       "      <td>9</td>\n",
+       "      <td>3</td>\n",
+       "      <td>26</td>\n",
+       "      <td>biweekly</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   start_date   end_date  open_items  red_flags  num_employees assessment_type\n",
+       "0  2023-01-01 2023-01-02          10          2             30          weekly\n",
+       "1  2023-01-08 2023-01-09          12          1             25        biweekly\n",
+       "2  2023-01-15 2023-01-16          11          3             28       quarterly\n",
+       "3  2023-01-22 2023-01-23           9          1             30          weekly\n",
+       "4  2023-01-29 2023-01-30          13          4             27        biweekly\n",
+       "5  2023-02-05 2023-02-06          14          2             26          weekly\n",
+       "6  2023-02-12 2023-02-13          15          1             31       quarterly\n",
+       "7  2023-02-19 2023-02-20          16          3             29        biweekly\n",
+       "8  2023-02-26 2023-02-27          12          2             25          weekly\n",
+       "9  2023-03-05 2023-03-06          11          1             30       quarterly\n",
+       "10 2023-03-12 2023-03-13          10          4             27          weekly\n",
+       "11 2023-03-19 2023-03-20           9          3             26        biweekly"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_dummy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_dummy.to_csv(\"test_data.csv\",index=False)"
    ]
   },
   {
@@ -1399,6 +1581,45 @@
    "metadata": {},
    "source": []
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dummy assessment data has been saved as dummy_company_asseement_data.csv.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Create dummy assessment data\n",
+    "data = {\n",
+    "    'Assessment_ID': range(1, 11),\n",
+    "    'Open_Items': [3, 4, 2, 5, 1, 3, 2, 4, 5, 3],\n",
+    "    'Red_Flags': [1, 2, 0, 1, 0, 3, 2, 1, 1, 2],\n",
+    "    'Assessment_Frequency': ['Weekly', 'Bi-Weekly', 'Weekly', 'Quarterly', 'Bi-Weekly', 'Weekly', 'Quarterly', 'Bi-Weekly', 'Weekly', 'Quarterly'],\n",
+    "    'Assessment_Start_Date': pd.date_range(start='2023-01-01', periods=10, freq='15D'),\n",
+    "    'Assessment_End_Date': pd.date_range(start='2023-01-07', periods=10, freq='15D'),\n",
+    "    'Assessment_Area': ['Deployment', 'Communication', 'Deployment', 'Communication', 'Deployment', 'Deployment', 'Communication', 'Deployment', 'Communication', 'Deployment'],\n",
+    "    'Assessment_Status': ['Completed', 'Completed', 'Completed', 'In Progress', 'Completed', 'Completed', 'Incomplete', 'Completed', 'In Progress', 'Completed'],\n",
+    "    'Assessment_Admin': ['Admin A', 'Admin B', 'Admin A', 'Admin B', 'Admin A', 'Admin A', 'Admin B', 'Admin A', 'Admin B', 'Admin A']\n",
+    "}\n",
+    "\n",
+    "# Create DataFrame\n",
+    "df = pd.DataFrame(data)\n",
+    "\n",
+    "# Save DataFrame to CSV\n",
+    "csv_file_path = 'dummy_company_asseement_data.csv'\n",
+    "df.to_csv(csv_file_path, index=False)\n",
+    "\n",
+    "print(f\"Dummy assessment data has been saved as {csv_file_path}.\")\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/run.py b/run.py
index 7d7cd4d..9ca3ee2 100644
--- a/run.py
+++ b/run.py
@@ -4,4 +4,4 @@ app = create_app()
 
 
 if __name__ == '__main__':
-    app.run(debug=True, port=5401)
+    app.run(debug=True, port=5402)  # NOTE(review): debug=True turns on Flask's debug mode -- confirm this entry point is dev-only
diff --git a/scripts/assessment_data.py b/scripts/assessment_data.py
new file mode 100644
index 0000000..104bc77
--- /dev/null
+++ b/scripts/assessment_data.py
@@ -0,0 +1,70 @@
+import pandas as pd
+
+def generate_summary_stats_v2(file_path):
+    """Summarise a company's assessment CSV into nested statistics for prompt building.
+
+    :param file_path: path to a CSV containing the columns Open_Items, Red_Flags,
+        Assessment_Frequency, Assessment_Start_Date, Assessment_End_Date, Assessment_Area,
+        Assessment_Status and Assessment_Admin.
+    :return: dict of sections, each mapping a human-readable label to a value or sub-dict.
+    :raises FileNotFoundError: if the CSV is missing; KeyError if a required column is absent.
+    """
+    df = pd.read_csv(file_path)
+    # Parse the date columns so durations and gaps can be computed as timedeltas.
+    df['Assessment_Start_Date'] = pd.to_datetime(df['Assessment_Start_Date'])
+    df['Assessment_End_Date'] = pd.to_datetime(df['Assessment_End_Date'])
+
+    # Length of each assessment, and the start-to-start gap between consecutive ones.
+    durations = (df['Assessment_End_Date'] - df['Assessment_Start_Date']).dt.days
+    gaps = df['Assessment_Start_Date'].sort_values().diff().dt.days
+    per_frequency = df.groupby('Assessment_Frequency').size()
+
+    def rate_by_frequency(status):
+        # Share of each frequency's assessments in `status`; frequencies with none give NaN -> 0.
+        hits = df[df['Assessment_Status'] == status].groupby('Assessment_Frequency').size()
+        return (hits / per_frequency).fillna(0).round(2).to_dict()
+
+    # mode() is empty when no assessment has a red flag; report None instead of raising IndexError.
+    flagged_areas = df.loc[df['Red_Flags'] > 0, 'Assessment_Area'].mode()
+
+    summary_stats = {
+        'Open Items and Red Flags': {
+            'Total Open Items': round(df['Open_Items'].sum(), 2),
+            'Average Open Items per Assessment': round(df['Open_Items'].mean(), 2),
+            'Total Red Flags': round(df['Red_Flags'].sum(), 2),
+            'Average Red Flags per Assessment': round(df['Red_Flags'].mean(), 2),
+            'Max Red Flags in a Single Assessment': round(df['Red_Flags'].max(), 2),
+            'Most Common Area with Red Flags': flagged_areas.iloc[0] if not flagged_areas.empty else None
+        },
+        'Assessment Frequency': {
+            'Assessment Type Breakdown': df['Assessment_Frequency'].value_counts(normalize=True).round(2).to_dict(),
+            # Start-to-start spacing in days; NaN when fewer than two assessments exist.
+            'Average Time Between Assessments': round(gaps.mean(), 2),
+            'Average Assessment Duration': round(durations.mean(), 2),
+            'Completion Rate by Frequency': rate_by_frequency('Completed'),
+            'In Progress Rate by Frequency': rate_by_frequency('In Progress'),
+            'Incomplete Rate by Frequency': rate_by_frequency('Incomplete')
+        },
+        'Assessment Start and End Dates': {
+            'Longest Assessment Duration (days)': round(durations.max(), 2),
+            'Shortest Assessment Duration (days)': round(durations.min(), 2),
+        },
+        'Assessment Areas': {
+            'Most Assessed Area': df['Assessment_Area'].value_counts().idxmax(),
+            'Most Open Items in Area': df.groupby('Assessment_Area')['Open_Items'].sum().idxmax(),
+            'Area with Most Red Flags': df.groupby('Assessment_Area')['Red_Flags'].sum().idxmax()
+        },
+        'Assessment Status': {
+            'Assessment Status Distribution': df['Assessment_Status'].value_counts(normalize=True).round(2).to_dict(),
+            'Incomplete Assessments': int((df['Assessment_Status'] == 'Incomplete').sum()),
+            'In Progress Assessments': int((df['Assessment_Status'] == 'In Progress').sum())
+        },
+        'Assessment Admin': {
+            'Most Frequent Admin': df['Assessment_Admin'].mode()[0],
+            'Admin with Fewest Red Flags': df.groupby('Assessment_Admin')['Red_Flags'].sum().idxmin(),
+            # NOTE(review): ranked by mean open items per assessment, not the total -- confirm intent.
+            'Admin with Most Open Items': df.groupby('Assessment_Admin')['Open_Items'].mean().idxmax()
+        }
+    }
+
+    return summary_stats
diff --git a/setup.py b/setup.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/api/routes/chatbot.py b/src/api/routes/chatbot.py
index 56b35d1..ab4c7d8 100644
--- a/src/api/routes/chatbot.py
+++ b/src/api/routes/chatbot.py
@@ -4,6 +4,7 @@ from werkzeug.utils import secure_filename
 from src.services.chatbot import Chatbot
 from src.utils.utils import delete_all_files_in_directory
 from src.utils.document_loader import load_document  
+from src.services.chatbot import Chatbot
 
 
 # Initialize the Blueprint
@@ -59,3 +60,64 @@ def validate_worker_document():
 
     except Exception as e:
         return jsonify({"error": "Internal Server Error", "message": str(e)}), 500
+
+
+@bot.route('/predict_next_n_assessments', methods=['POST'])
+def predict_next_n_assessments():
+    """Predict the next N assessments for a company from a JSON POST body.
+
+    Expects 'company_info', 'companyid' and 'N'. Responds 400 when the payload is missing or
+    invalid or the service returns nothing, 500 on unexpected errors, else 200 with 'predictions'.
+    """
+    try:
+        # silent=True yields None for absent/malformed JSON instead of raising into the 500 handler.
+        data = request.get_json(silent=True)
+        if not isinstance(data, dict):
+            return jsonify({"error": "Missing data", "message": "Request body must be a JSON object."}), 400
+        company_info, companyid, N = data.get('company_info'), data.get('companyid'), data.get('N')
+        if not company_info or not companyid or N is None:
+            return jsonify({"error": "Missing data", "message": "Company info, company ID, or N value not provided."}), 400
+        # The ID may be used to build a data file name downstream; refuse anything path-like.
+        if secure_filename(str(companyid)) != str(companyid):
+            return jsonify({"error": "Invalid data", "message": "Company ID contains invalid characters."}), 400
+        response = Chatbot().predict_next_n_assessment(
+            company_info=company_info,
+            companyid=companyid,
+            N=N
+        )
+        if not response:
+            return jsonify({"error": "No predictions available", "message": "Prediction process failed."}), 400
+        return jsonify({"predictions": response}), 200
+    except Exception as e:
+        return jsonify({"error": "Internal Server Error", "message": str(e)}), 500
+
+
+
+@bot.route('/use_bot_predict_assessments', methods=['POST'])
+def use_bot_predict_assessments():
+    """Answer a free-text query about a company's past assessments from a JSON POST body.
+
+    Expects 'company_info', 'companyid' and 'query'. Responds 400 when the payload is missing or
+    invalid or the service returns nothing, 500 on unexpected errors, else 200 with 'predictions'.
+    """
+    try:
+        # silent=True yields None for absent/malformed JSON instead of raising into the 500 handler.
+        data = request.get_json(silent=True)
+        if not isinstance(data, dict):
+            return jsonify({"error": "Missing data", "message": "Request body must be a JSON object."}), 400
+        company_info, companyid, query = data.get('company_info'), data.get('companyid'), data.get('query')
+        if not company_info or not companyid or query is None:
+            return jsonify({"error": "Missing data", "message": "Company info, company ID, or query value not provided."}), 400
+        # The service joins the ID into the CSV path, so refuse anything secure_filename would alter.
+        if secure_filename(str(companyid)) != str(companyid):
+            return jsonify({"error": "Invalid data", "message": "Company ID contains invalid characters."}), 400
+        response = Chatbot().predict_based_on_past_assessment(
+            company_info=company_info,
+            companyid=companyid,
+            query=query
+        )
+        if not response:
+            return jsonify({"error": "No predictions available", "message": "Prediction process failed."}), 400
+        return jsonify({"predictions": response}), 200
+    except Exception as e:
+        return jsonify({"error": "Internal Server Error", "message": str(e)}), 500
\ No newline at end of file
diff --git a/src/models/bot_response_schema.py b/src/models/bot_response_schema.py
index 22b0560..44feea6 100644
--- a/src/models/bot_response_schema.py
+++ b/src/models/bot_response_schema.py
@@ -4,4 +4,21 @@ from typing import List, Dict
 
 class ValidateWorker(BaseModel):
     result:str
-    
\ No newline at end of file
+    
+class Result(BaseModel):  # Single-field model; the return annotation of Chatbot.predict_based_on_past_assessment.
+    response:str  # free-text answer
+
+class Cases(BaseModel):  # Pair of integer counts; used as the per-frequency value in AssessmentsFrequency.
+    open_items: int  # number of open items
+    red_flags: int  # number of red flags
+
+class AssessmentsFrequency(BaseModel):  # One required Cases entry per assessment frequency.
+    weekly: Cases
+    biweekly: Cases
+    quarterly: Cases
+
+class AssessmentPrediction(BaseModel):  # Per-frequency predictions for a single assessment.
+    AssessmentN: AssessmentsFrequency  # NOTE(review): key is literally 'AssessmentN'; the prompt example uses 'assessment 1' -- confirm
+
+class AssessmentPredictionsResponse(BaseModel):  # Top-level container for a list of predictions.
+    predictions: List[AssessmentPrediction]  # one AssessmentPrediction per list element
diff --git a/src/pipeline/data_preprocessor.py b/src/pipeline/data_preprocessor.py
index adccf94..c895ba5 100644
--- a/src/pipeline/data_preprocessor.py
+++ b/src/pipeline/data_preprocessor.py
@@ -4,11 +4,6 @@ import logging
 from logging.handlers import RotatingFileHandler
 
 
-handler = RotatingFileHandler('/root/ds_erp_ai/logs/prediction_pipeline.log', maxBytes=100000, backupCount=3)
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-logger.addHandler(handler)
-
 class DataPreprocessor:
     def __init__(self, input_path, company_id):
         self.input_path = input_path
diff --git a/src/pipeline/model_trainer.py b/src/pipeline/model_trainer.py
index dd7b769..c7f70cc 100644
--- a/src/pipeline/model_trainer.py
+++ b/src/pipeline/model_trainer.py
@@ -33,7 +33,8 @@ class ModelTrainer:
         y = self.df[['open_items', 'red_flags']]  # Multi-target for open items and red flags
 
         # Split into training and test sets with 10% as test size
-        X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+        X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=False)
+
 
         # Train the model
         self.model.fit(X_train, y_train)
diff --git a/src/prompts/chatbot.py b/src/prompts/chatbot.py
index 847f3ee..b0fed5e 100644
--- a/src/prompts/chatbot.py
+++ b/src/prompts/chatbot.py
@@ -1,3 +1,5 @@
+
+
 def validate_worker_prompt() -> str:
    return """
     You are a worker in the company "Validate" where you are asked a specific yes or no question:
@@ -15,4 +17,133 @@ def validate_worker_prompt() -> str:
       result:"validated" 
     }
     """
-    
\ No newline at end of file
+def predict_based_past_assessment_prompt(query,company_info, summary_stats):
+    """Build the chatbot prompt that answers `query` from past assessment data.
+
+    company_info must hold 'company_name', 'company_size' and 'departments' (KeyError otherwise);
+    summary_stats is the nested dict returned by generate_summary_stats_v2. Returns the prompt text.
+    """
+    company_name, company_size = company_info['company_name'], company_info['company_size']
+    departments = company_info['departments']
+    # Shares are rendered with ':.0%' because 'share * 100' leaks binary float noise into the text.
+    return f"""
+    **Prompt for the Chatbot:**
+
+    **Context:**
+    You are an AI assistant working for {company_name}, and your primary responsibility is to provide **insights**, **predictions**, and **recommendations** based on the company's past assessment data and organizational structure. You are not allowed to respond to any queries outside of this domain.
+
+    **General Company Information:**
+    - **Company Name**: {company_name}
+    - **Company Size**: {company_size} (e.g., Small, Medium, Large)
+    - **Departments**: 
+      {', '.join(departments)}
+
+    **Assessment Summary**:
+    The following is a detailed summary of past assessments at {company_name}. Use this information to provide predictions and recommendations based on trends and data points.
+    
+    - **Open Items and Red Flags**:
+      - Total Open Items: {summary_stats['Open Items and Red Flags']['Total Open Items']}
+      - Average Open Items per Assessment: {summary_stats['Open Items and Red Flags']['Average Open Items per Assessment']}
+      - Total Red Flags: {summary_stats['Open Items and Red Flags']['Total Red Flags']}
+      - Average Red Flags per Assessment: {summary_stats['Open Items and Red Flags']['Average Red Flags per Assessment']}
+      - Max Red Flags in a Single Assessment: {summary_stats['Open Items and Red Flags']['Max Red Flags in a Single Assessment']}
+      - Most Common Area with Red Flags: {summary_stats['Open Items and Red Flags']['Most Common Area with Red Flags']}
+    
+    - **Assessment Frequency**:
+      - Weekly: {summary_stats['Assessment Frequency']['Assessment Type Breakdown'].get('Weekly', 0):.0%}
+      - Bi-Weekly: {summary_stats['Assessment Frequency']['Assessment Type Breakdown'].get('Bi-Weekly', 0):.0%}
+      - Quarterly: {summary_stats['Assessment Frequency']['Assessment Type Breakdown'].get('Quarterly', 0):.0%}
+      - Average Time Between Assessments: {summary_stats['Assessment Frequency']['Average Time Between Assessments']} days
+      - Average Assessment Duration: {summary_stats['Assessment Frequency']['Average Assessment Duration']} days
+
+    - **Assessment Start and End Dates**:
+      - Longest Assessment Duration: {summary_stats['Assessment Start and End Dates']['Longest Assessment Duration (days)']} days
+      - Shortest Assessment Duration: {summary_stats['Assessment Start and End Dates']['Shortest Assessment Duration (days)']} days
+
+    - **Assessment Areas**:
+      - Most Assessed Area: {summary_stats['Assessment Areas']['Most Assessed Area']}
+      - Most Open Items in Area: {summary_stats['Assessment Areas']['Most Open Items in Area']}
+      - Area with Most Red Flags: {summary_stats['Assessment Areas']['Area with Most Red Flags']}
+
+    - **Assessment Status**:
+      - Completed: {summary_stats['Assessment Status']['Assessment Status Distribution'].get('Completed', 0):.0%}
+      - In Progress: {summary_stats['Assessment Status']['Assessment Status Distribution'].get('In Progress', 0):.0%}
+      - Incomplete: {summary_stats['Assessment Status']['Assessment Status Distribution'].get('Incomplete', 0):.0%}
+
+    - **Assessment Admin**:
+      - Most Frequent Admin: {summary_stats['Assessment Admin']['Most Frequent Admin']}
+      - Admin with Fewest Red Flags: {summary_stats['Assessment Admin']['Admin with Fewest Red Flags']}
+      - Admin with Most Open Items: {summary_stats['Assessment Admin']['Admin with Most Open Items']}
+
+    **Instructions:**
+    Use the above information to answer user queries. You should:
+    - Analyze historical data to identify trends and problem areas.
+    - Predict potential outcomes for future assessments based on past performance (e.g., meeting deadlines, reducing red flags).
+    - Provide **actionable recommendations** that can help improve performance in future assessments.
+
+    **User Query**:
+    "{query}"
+
+    **Your Response**:
+    Predict and provide recommendations based on the company’s historical data, focusing on the areas most relevant to the query. Ensure the response is based on past trends and performance issues.
+
+    **Examples of Insightful Responses**:
+    - "To improve your performance in the next assessment, you should focus on reducing red flags in the Communication department, as it has had the most issues."
+    - "Based on the company's past performance, there is a 70% chance that you will meet the deadline for the next weekly assessment. To ensure success, focus on completing open items in the IT department."
+    - "The data indicates that quarterly assessments have the highest rate of incomplete tasks. I recommend prioritizing quarterly assessment tasks to avoid falling behind."
+    """
+
+
+
+def predict_next_n_assessments_prompt():
+    """Return the instruction prompt for predicting open items and red flags of the next N assessments."""
+    # Plain (non-f) string: every brace below is literal JSON and reaches the model verbatim.
+    prompt = """
+    **Prompt for the Chatbot:**
+
+    **Context:**
+    You are an AI assistant responsible for analyzing the past assessment data of a company, and your primary responsibility is to provide **predictions** for the next N assessments. 
+    These assessments can occur on a **weekly**, **bi-weekly**, or **quarterly** basis. Use the company's past performance to predict the following for each of the next N assessments:
+    - **Number of Open Items**.
+    - **Number of Red Flags**.
+    - **Predictions for Weekly, Bi-Weekly, and Quarterly assessments**.
+    input : 
+    - company basic info
+    - past assessment statistics
+    - N - number of next assessments to be predicted 
+    **General Company Information:**
+    
+
+    **Assessment Summary (Past Data)**:
+    The detailed information on past assessments will be provided. Use this information to make predictions for the next N assessments.
+
+
+    **Instructions**:
+    - Predict the number of open items and red flags for the next N assessments if they are conducted on a weekly, bi-weekly, or quarterly basis.
+    - Use the historical summary statistics provided above to guide your predictions.
+    - Return the response in the following JSON format:
+
+    **Response Format**:
+    ```json
+    {
+      "assessment 1": [
+        {
+          "weekly": {"open_items": X, "red_flags": Y},
+          "biweekly": {"open_items": X, "red_flags": Y},
+          "quarterly": {"open_items": X, "red_flags": Y}
+        }
+      ],
+      "assessment 2": [
+        {
+          "weekly": {"open_items": X, "red_flags": Y},
+          "biweekly": {"open_items": X, "red_flags": Y},
+          "quarterly": {"open_items": X, "red_flags": Y}
+        }
+      ]
+    }
+    ```
+    (The example above assumes N is 2.)
+    Ensure each assessment is provided with three predictions: one for Weekly, one for Bi-Weekly, and one for Quarterly assessments.
+    """
+    
+    return prompt
diff --git a/src/services/chatbot.py b/src/services/chatbot.py
index d9cffbd..493dba8 100644
--- a/src/services/chatbot.py
+++ b/src/services/chatbot.py
@@ -7,6 +7,7 @@ from src.prompts.sops import *
 from src.prompts.chatbot import *
 from src.models.sop_response_schemas import *
 from src.models.bot_response_schema import *
+from scripts.assessment_data import generate_summary_stats_v2
 from dotenv import load_dotenv
 load_dotenv()
 
@@ -52,7 +53,7 @@ class Chatbot:
                     }
                 ],
                 response_format=ValidateWorker,
-                max_tokens=4096,
+                max_tokens=1024,
                 temperature=0.1
             )
 
@@ -64,3 +65,121 @@ class Chatbot:
         except Exception as e:
             print(f"An error occurred: {e}")
             return None
+        
+    
+
+
+    def predict_based_on_past_assessment(self, query, company_info, companyid) -> Result:
+        """
+        Answer a free-form query about a company using its past assessment data. The company's CSV is
+        summarised, a system prompt is built from it, and the prompt plus query are sent to ``self.model``.
+
+        :param query: The question or query asked by the user.
+        :param company_info: General information about the company (name, size, departments, etc.).
+        :param companyid: Identifier interpolated into the CSV file name ``<companyid>_raw_data.csv``.
+        :return: The JSON-decoded message content (not a ``Result`` instance), or None if any exception occurs.
+        """
+        try:
+            # Per-company CSV path. NOTE(review): companyid is interpolated as-is; confirm callers validate it
+            data_path = os.path.join('data', 'raw', 'erp_company_assessment', f'{companyid}_raw_data.csv')
+            
+            # Generate summary statistics from the company's assessment data
+            summary_stats = generate_summary_stats_v2(file_path=data_path)
+           
+
+            # Generate the prompt using the query, the company info and the summary statistics
+            prompt = predict_based_past_assessment_prompt(
+                query=query,
+                company_info=company_info,
+                summary_stats=summary_stats
+            )
+   
+            # Request a completion parsed against Result: prompt as system message, raw query as user message
+            response = self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": f"{prompt}"
+                    },
+                    {
+                        "role": "user",
+                        "content": f"{query}",
+                    }
+                ],
+                response_format=Result,
+                max_tokens=1024,
+                temperature=0.1
+            )
+
+            # Decode the first choice's message content from JSON into plain Python data
+            extracted_text = json.loads(response.choices[0].message.content)
+
+            return extracted_text
+        # Every failure is printed rather than raised; the caller only ever sees None
+        except Exception as e:
+            print(f"An error occurred: {e}")
+            return None
+
+    
+    def predict_next_n_assessment(self, company_info, companyid, N) -> AssessmentPredictionsResponse:
+        """
+        Predict the outcomes of a company's next N assessments from its past assessment data. The company's
+        CSV is summarised and sent, together with the company info and N, to ``self.model``.
+
+        :param company_info: General information about the company (name, size, departments, etc.).
+        :param companyid: Identifier interpolated into the CSV file name ``<companyid>_raw_data.csv``.
+        :param N: Number of assessments to predict; also the number of entries read from the response.
+        :return: Dict mapping ``assessment_1`` .. ``assessment_N`` to each entry's ``AssessmentN`` value
+            (not an ``AssessmentPredictionsResponse`` instance), or None if any exception occurs.
+        """
+        try:
+            # Per-company CSV path. NOTE(review): companyid is interpolated as-is; confirm callers validate it
+            data_path = os.path.join('data', 'raw', 'erp_company_assessment', f'{companyid}_raw_data.csv')
+            
+            # Generate summary statistics from the company's assessment data
+            summary_stats = generate_summary_stats_v2(file_path=data_path)
+
+            # The prompt builder takes no arguments; company info, N and the stats travel in the user messages
+            prompt = predict_next_n_assessments_prompt()
+          
+            
+            # Request a completion parsed against the AssessmentPredictionsResponse schema
+            response = self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": f"{prompt}"
+                    },
+                    {
+                        "role": "user",
+                        "content": f"company info: {company_info}--> N-value is {N} ",
+                    },
+                    {
+                        "role": "user",
+                        "content": f"Summary stats: {summary_stats}",
+                    }
+                ],
+                response_format=AssessmentPredictionsResponse,
+                max_tokens=1024,
+                temperature=0.1
+            )
+
+            # Decode the first choice's message content from JSON into plain Python data
+            extracted_text = json.loads(response.choices[0].message.content)
+
+            # Initialize dictionary to store assessments with dynamic names
+            predictions = {}
+
+            # NOTE(review): reads exactly N entries; a shorter "predictions" list raises IndexError -> None
+            for i in range(N):
+                assessment_key = f"assessment_{i + 1}"
+                predictions[assessment_key] = extracted_text["predictions"][i]['AssessmentN']
+
+            # Return the dynamically named assessments
+            return predictions
+
+        except Exception as e:
+            print(f"An error occurred: {e}")
+            return None
diff --git a/test.py b/test.py
index 11fbc78..09ca595 100644
--- a/test.py
+++ b/test.py
@@ -6,9 +6,7 @@ input_base_path = '/root/ds_erp_ai/data/raw/erp_assessment_prediction'  # The ba
 pipeline = CompanyModelPipeline(company_ids=company_ids, input_base_path=input_base_path)
 pipeline.run_pipeline()'''
 
-from src.pipeline.inference import AssessmentInference
-
-
+'''from src.pipeline.inference import AssessmentInference
 
 inference = AssessmentInference(
     company_id="testid",num_assessments=2
@@ -18,3 +16,32 @@ result = inference.run()
 
 
 print(result)
+'''
+'''
+response2 = bot.predict_next_n_assessment(
+    company_info=company_info,
+    companyid="testid",
+    N=3
+)
+
+print(f"Predictions {response2}")
+'''
+# Manual check: ask Chatbot.predict_based_on_past_assessment one question for company "testid" and print it
+from src.services.chatbot import Chatbot
+company_info = {
+    'company_name': "ABC Corp",
+    'company_size': "Medium",  # Can be "Small", "Medium", or "Large"
+    'departments': ["Sales", "Marketing", "IT", "Finance", "HR", "Logistics"]
+}
+bot = Chatbot()
+response = bot.predict_based_on_past_assessment(
+    query="Should i make my next assessment weekly or biweekly to meet up to deadline?",
+    company_info=company_info,
+    companyid="testid"
+)
+
+print(f"Result: {response}")
+
+
+
+