From cd8f499f977dc371da5860c2dc94e3052b6e0dcb Mon Sep 17 00:00:00 2001 From: kowshik Date: Sat, 14 Sep 2024 01:50:41 +0000 Subject: [PATCH] added bot prediction for assessments --- .../testid_raw_data.2csv | 11 + .../testid_raw_data.csv | 11 + notebooks/dummy_assessment_data.csv | 11 + notebooks/test_prediction_pipeline.ipynb | 239 +++++++++++++++++- run.py | 2 +- scripts/assessment_data.py | 70 +++++ setup.py | 0 src/api/routes/chatbot.py | 62 +++++ src/models/bot_response_schema.py | 19 +- src/pipeline/data_preprocessor.py | 5 - src/pipeline/model_trainer.py | 3 +- src/prompts/chatbot.py | 133 +++++++++- src/services/chatbot.py | 121 ++++++++- test.py | 33 ++- 14 files changed, 698 insertions(+), 22 deletions(-) create mode 100644 data/raw/erp_company_assessment/testid_raw_data.2csv create mode 100644 data/raw/erp_company_assessment/testid_raw_data.csv create mode 100644 notebooks/dummy_assessment_data.csv create mode 100644 scripts/assessment_data.py delete mode 100644 setup.py diff --git a/data/raw/erp_company_assessment/testid_raw_data.2csv b/data/raw/erp_company_assessment/testid_raw_data.2csv new file mode 100644 index 0000000..1cba78e --- /dev/null +++ b/data/raw/erp_company_assessment/testid_raw_data.2csv @@ -0,0 +1,11 @@ +Assessment_ID,Open_Items,Red_Flags,Assessment_Frequency,Assessment_Start_Date,Assessment_End_Date,Assessment_Area,Assessment_Status,Assessment_Admin +1,3,1,Weekly,2023-01-01,2023-01-07,Deployment,Completed,Admin A +2,4,2,Bi-Weekly,2023-01-16,2023-01-22,Communication,Completed,Admin B +3,2,0,Weekly,2023-01-31,2023-02-06,Deployment,Completed,Admin A +4,5,1,Quarterly,2023-02-15,2023-02-21,Communication,In Progress,Admin B +5,1,0,Bi-Weekly,2023-03-02,2023-03-08,Deployment,Completed,Admin A +6,3,3,Weekly,2023-03-17,2023-03-23,Deployment,Completed,Admin A +7,2,2,Quarterly,2023-04-01,2023-04-07,Communication,Incomplete,Admin B +8,4,1,Bi-Weekly,2023-04-16,2023-04-22,Deployment,Completed,Admin A +9,5,1,Weekly,2023-05-01,2023-05-07,Communication,In Progress,Admin B +10,3,2,Quarterly,2023-05-16,2023-05-22,Deployment,Completed,Admin A diff --git a/data/raw/erp_company_assessment/testid_raw_data.csv b/data/raw/erp_company_assessment/testid_raw_data.csv new file mode 100644 index 0000000..77da1b3 --- /dev/null +++ b/data/raw/erp_company_assessment/testid_raw_data.csv @@ -0,0 +1,11 @@ +Assessment_ID,Open_Items,Red_Flags,Assessment_Frequency,Assessment_Start_Date,Assessment_End_Date,Assessment_Area,Assessment_Status,Assessment_Admin,Department +1,3,1,Weekly,2023-01-01,2023-01-07,Deployment,Completed,Admin A,IT +2,4,2,Bi-Weekly,2023-01-16,2023-01-22,Communication,Completed,Admin B,HR +3,2,0,Weekly,2023-01-31,2023-02-06,Deployment,Completed,Admin A,Finance +4,5,1,Quarterly,2023-02-15,2023-02-21,Communication,In Progress,Admin B,IT +5,1,0,Bi-Weekly,2023-03-02,2023-03-08,Deployment,Completed,Admin A,HR +6,3,3,Weekly,2023-03-17,2023-03-23,Deployment,Completed,Admin A,Finance +7,2,2,Quarterly,2023-04-01,2023-04-07,Communication,Incomplete,Admin B,IT +8,4,1,Bi-Weekly,2023-04-16,2023-04-22,Deployment,Completed,Admin A,HR +9,5,1,Weekly,2023-05-01,2023-05-07,Communication,In Progress,Admin B,Finance +10,3,2,Quarterly,2023-05-16,2023-05-22,Deployment,Completed,Admin A,IT diff --git a/notebooks/dummy_assessment_data.csv b/notebooks/dummy_assessment_data.csv new file mode 100644 index 0000000..1cba78e --- /dev/null +++ b/notebooks/dummy_assessment_data.csv @@ -0,0 +1,11 @@ +Assessment_ID,Open_Items,Red_Flags,Assessment_Frequency,Assessment_Start_Date,Assessment_End_Date,Assessment_Area,Assessment_Status,Assessment_Admin +1,3,1,Weekly,2023-01-01,2023-01-07,Deployment,Completed,Admin A +2,4,2,Bi-Weekly,2023-01-16,2023-01-22,Communication,Completed,Admin B +3,2,0,Weekly,2023-01-31,2023-02-06,Deployment,Completed,Admin A +4,5,1,Quarterly,2023-02-15,2023-02-21,Communication,In Progress,Admin B +5,1,0,Bi-Weekly,2023-03-02,2023-03-08,Deployment,Completed,Admin A +6,3,3,Weekly,2023-03-17,2023-03-23,Deployment,Completed,Admin A +7,2,2,Quarterly,2023-04-01,2023-04-07,Communication,Incomplete,Admin B +8,4,1,Bi-Weekly,2023-04-16,2023-04-22,Deployment,Completed,Admin A +9,5,1,Weekly,2023-05-01,2023-05-07,Communication,In Progress,Admin B +10,3,2,Quarterly,2023-05-16,2023-05-22,Deployment,Completed,Admin A diff --git a/notebooks/test_prediction_pipeline.ipynb b/notebooks/test_prediction_pipeline.ipynb index 50a7fab..01c4c39 100644 --- a/notebooks/test_prediction_pipeline.ipynb +++ b/notebooks/test_prediction_pipeline.ipynb @@ -2,22 +2,204 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Create a dummy dataset with past 5 assessments\n", "import pandas as pd\n", + "\n", "data_dummy = {\n", - " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n", - " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n", - " 'open_items': [10, 12, 11, 9, 13],\n", - " 'red_flags': [2, 1, 3, 1, 4],\n", - " 'num_employees': [30, 25, 28, 30, 27],\n", - " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n", + " 'start_date': pd.date_range(start='2023-01-01', periods=12, freq='7D'),\n", + " 'end_date': pd.date_range(start='2023-01-02', periods=12, freq='7D'),\n", + " 'open_items': [10, 12, 11, 9, 13, 14, 15, 16, 12, 11, 10, 9],\n", + " 'red_flags': [2, 1, 3, 1, 4, 2, 1, 3, 2, 1, 4, 3],\n", + " 'num_employees': [30, 25, 28, 30, 27, 26, 31, 29, 25, 30, 27, 26],\n", + " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly', \n", + " 'weekly', 'quarterly', 'biweekly', 'weekly', 'quarterly', 'weekly', 'biweekly']\n", "}\n", "\n", - "df_dummy = pd.DataFrame(data_dummy)" + "df_dummy = pd.DataFrame(data_dummy)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateopen_itemsred_flagsnum_employeesassessment_type
02023-01-012023-01-0210230weekly
12023-01-082023-01-0912125biweekly
22023-01-152023-01-1611328quarterly
32023-01-222023-01-239130weekly
42023-01-292023-01-3013427biweekly
52023-02-052023-02-0614226weekly
62023-02-122023-02-1315131quarterly
72023-02-192023-02-2016329biweekly
82023-02-262023-02-2712225weekly
92023-03-052023-03-0611130quarterly
102023-03-122023-03-1310427weekly
112023-03-192023-03-209326biweekly
\n", + "
" + ], + "text/plain": [ + " start_date end_date open_items red_flags num_employees assessment_type\n", + "0 2023-01-01 2023-01-02 10 2 30 weekly\n", + "1 2023-01-08 2023-01-09 12 1 25 biweekly\n", + "2 2023-01-15 2023-01-16 11 3 28 quarterly\n", + "3 2023-01-22 2023-01-23 9 1 30 weekly\n", + "4 2023-01-29 2023-01-30 13 4 27 biweekly\n", + "5 2023-02-05 2023-02-06 14 2 26 weekly\n", + "6 2023-02-12 2023-02-13 15 1 31 quarterly\n", + "7 2023-02-19 2023-02-20 16 3 29 biweekly\n", + "8 2023-02-26 2023-02-27 12 2 25 weekly\n", + "9 2023-03-05 2023-03-06 11 1 30 quarterly\n", + "10 2023-03-12 2023-03-13 10 4 27 weekly\n", + "11 2023-03-19 2023-03-20 9 3 26 biweekly" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_dummy" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df_dummy.to_csv(\"test_data.csv\",index=False)" ] }, { @@ -1399,6 +1581,45 @@ "metadata": {}, "source": [] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dummy assessment data has been saved as dummy_company_asseement_data.csv.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Create dummy assessment data\n", + "data = {\n", + " 'Assessment_ID': range(1, 11),\n", + " 'Open_Items': [3, 4, 2, 5, 1, 3, 2, 4, 5, 3],\n", + " 'Red_Flags': [1, 2, 0, 1, 0, 3, 2, 1, 1, 2],\n", + " 'Assessment_Frequency': ['Weekly', 'Bi-Weekly', 'Weekly', 'Quarterly', 'Bi-Weekly', 'Weekly', 'Quarterly', 'Bi-Weekly', 'Weekly', 'Quarterly'],\n", + " 'Assessment_Start_Date': pd.date_range(start='2023-01-01', periods=10, freq='15D'),\n", + " 'Assessment_End_Date': pd.date_range(start='2023-01-07', periods=10, freq='15D'),\n", + " 'Assessment_Area': ['Deployment', 'Communication', 'Deployment', 'Communication', 'Deployment', 'Deployment', 'Communication', 'Deployment', 'Communication', 'Deployment'],\n", + " 'Assessment_Status': ['Completed', 'Completed', 'Completed', 'In Progress', 'Completed', 'Completed', 'Incomplete', 'Completed', 'In Progress', 'Completed'],\n", + " 'Assessment_Admin': ['Admin A', 'Admin B', 'Admin A', 'Admin B', 'Admin A', 'Admin A', 'Admin B', 'Admin A', 'Admin B', 'Admin A']\n", + "}\n", + "\n", + "# Create DataFrame\n", + "df = pd.DataFrame(data)\n", + "\n", + "# Save DataFrame to CSV\n", + "csv_file_path = 'dummy_company_asseement_data.csv'\n", + "df.to_csv(csv_file_path, index=False)\n", + "\n", + "print(f\"Dummy assessment data has been saved as {csv_file_path}.\")\n" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/run.py b/run.py index 7d7cd4d..9ca3ee2 100644 --- a/run.py +++ b/run.py @@ -4,4 +4,4 @@ app = create_app() if __name__ == '__main__': - app.run(debug=True, port=5401) + app.run(debug=True, port=5402) diff --git a/scripts/assessment_data.py b/scripts/assessment_data.py new file mode 100644 index 0000000..104bc77 --- /dev/null +++ b/scripts/assessment_data.py @@ -0,0 +1,70 @@ +import pandas as pd + +def generate_summary_stats_v2(file_path): + # Load the DataFrame from the provided file path + df = pd.read_csv(file_path) + + # Ensure date columns are correctly parsed + df['Assessment_Start_Date'] = pd.to_datetime(df['Assessment_Start_Date']) + df['Assessment_End_Date'] = pd.to_datetime(df['Assessment_End_Date']) + + # Add completion rate calculation + completed_status = df['Assessment_Status'] == 'Completed' + completion_rate_by_frequency = df[completed_status].groupby('Assessment_Frequency').size() / df.groupby('Assessment_Frequency').size() + + in_progress_status = df['Assessment_Status'] == 'In Progress' + incomplete_status = df['Assessment_Status'] == 'Incomplete' + + # Calculate in-progress and incomplete rates by frequency + in_progress_rate_by_frequency = df[in_progress_status].groupby('Assessment_Frequency').size() / df.groupby('Assessment_Frequency').size() + incomplete_rate_by_frequency = df[incomplete_status].groupby('Assessment_Frequency').size() / df.groupby('Assessment_Frequency').size() + + # Fill NaN values (where no assessments are in-progress or incomplete for certain frequencies) + completion_rate_by_frequency = completion_rate_by_frequency.fillna(0) + in_progress_rate_by_frequency = in_progress_rate_by_frequency.fillna(0) + incomplete_rate_by_frequency = incomplete_rate_by_frequency.fillna(0) + + # Round all numerical values to 2 decimal places + completion_rate_by_frequency = completion_rate_by_frequency.round(2) + in_progress_rate_by_frequency = in_progress_rate_by_frequency.round(2) + incomplete_rate_by_frequency = incomplete_rate_by_frequency.round(2) + + summary_stats = { + 'Open Items and Red Flags': { + 'Total Open Items': round(df['Open_Items'].sum(), 2), + 'Average Open Items per Assessment': round(df['Open_Items'].mean(), 2), + 'Total Red Flags': round(df['Red_Flags'].sum(), 2), + 'Average Red Flags per Assessment': round(df['Red_Flags'].mean(), 2), + 'Max Red Flags in a Single Assessment': round(df['Red_Flags'].max(), 2), + 'Most Common Area with Red Flags': df[df['Red_Flags'] > 0]['Assessment_Area'].mode()[0] + }, + 'Assessment Frequency': { + 'Assessment Type Breakdown': df['Assessment_Frequency'].value_counts(normalize=True).round(2).to_dict(), + 'Average Time Between Assessments': round((df['Assessment_End_Date'] - df['Assessment_Start_Date']).dt.days.mean(), 2), + 'Average Assessment Duration': round(df['Assessment_End_Date'].sub(df['Assessment_Start_Date']).dt.days.mean(), 2), + 'Completion Rate by Frequency': completion_rate_by_frequency.to_dict(), + 'In Progress Rate by Frequency': in_progress_rate_by_frequency.to_dict(), + 'Incomplete Rate by Frequency': incomplete_rate_by_frequency.to_dict() + }, + 'Assessment Start and End Dates': { + 'Longest Assessment Duration (days)': round(df['Assessment_End_Date'].sub(df['Assessment_Start_Date']).dt.days.max(), 2), + 'Shortest Assessment Duration (days)': round(df['Assessment_End_Date'].sub(df['Assessment_Start_Date']).dt.days.min(), 2), + }, + 'Assessment Areas': { + 'Most Assessed Area': df['Assessment_Area'].value_counts().idxmax(), + 'Most Open Items in Area': df.groupby('Assessment_Area')['Open_Items'].sum().idxmax(), + 'Area with Most Red Flags': df.groupby('Assessment_Area')['Red_Flags'].sum().idxmax() + }, + 'Assessment Status': { + 'Assessment Status Distribution': df['Assessment_Status'].value_counts(normalize=True).round(2).to_dict(), + 'Incomplete Assessments': round(df[df['Assessment_Status'] == 'Incomplete'].shape[0], 2), + 'In Progress Assessments': round(df[df['Assessment_Status'] == 'In Progress'].shape[0], 2) + }, + 'Assessment Admin': { + 'Most Frequent Admin': df['Assessment_Admin'].mode()[0], + 'Admin with Fewest Red Flags': df.groupby('Assessment_Admin')['Red_Flags'].sum().idxmin(), + 'Admin with Most Open Items': df.groupby('Assessment_Admin')['Open_Items'].mean().idxmax() + } + } + + return summary_stats diff --git a/setup.py b/setup.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/api/routes/chatbot.py b/src/api/routes/chatbot.py index 56b35d1..ab4c7d8 100644 --- a/src/api/routes/chatbot.py +++ b/src/api/routes/chatbot.py @@ -4,6 +4,7 @@ from werkzeug.utils import secure_filename from src.services.chatbot import Chatbot from src.utils.utils import delete_all_files_in_directory from src.utils.document_loader import load_document +from src.services.chatbot import Chatbot # Initialize the Blueprint @@ -59,3 +60,64 @@ def validate_worker_document(): except Exception as e: return jsonify({"error": "Internal Server Error", "message": str(e)}), 500 + + +@bot.route('/predict_next_n_assessments', methods=['POST']) +def predict_next_n_assessments(): + try: + # Retrieve JSON data from the request + data = request.get_json() + company_info = data.get('company_info') + companyid = data.get('companyid') + N = data.get('N') + + if not company_info or not companyid or N is None: + return jsonify({"error": "Missing data", "message": "Company info, company ID, or N value not provided."}), 400 + + # Instantiate the chatbot service + chatbot = Chatbot() + + # Call the prediction method + response = chatbot.predict_next_n_assessment( + company_info=company_info, + companyid=companyid, + N=N + ) + if not response: + return jsonify({"error": "No predictions available", "message": "Prediction process failed."}), 400 + + return jsonify({"predictions": response}), 200 + + except Exception as e: + return jsonify({"error": "Internal Server Error", "message": str(e)}), 500 + + + +@bot.route('/use_bot_predict_assessments', methods=['POST']) +def use_bot_predict_assessments(): + try: + # Retrieve JSON data from the request + data = request.get_json() + company_info = data.get('company_info') + companyid = data.get('companyid') + query = data.get('query') + + if not company_info or not companyid or query is None: + return jsonify({"error": "Missing data", "message": "Company info, company ID, or query value not provided."}), 400 + + # Instantiate the chatbot service + chatbot = Chatbot() + + # Call the prediction method + response = chatbot.predict_based_on_past_assessment( + company_info=company_info, + companyid=companyid, + query=query + ) + if not response: + return jsonify({"error": "No predictions available", "message": "Prediction process failed."}), 400 + + return jsonify({"predictions": response}), 200 + + except Exception as e: + return jsonify({"error": "Internal Server Error", "message": str(e)}), 500 \ No newline at end of file diff --git a/src/models/bot_response_schema.py b/src/models/bot_response_schema.py index 22b0560..44feea6 100644 --- a/src/models/bot_response_schema.py +++ b/src/models/bot_response_schema.py @@ -4,4 +4,21 @@ from typing import List, Dict class ValidateWorker(BaseModel): result:str - \ No newline at end of file + +class Result(BaseModel): + response:str + +class Cases(BaseModel): + open_items: int + red_flags: int + +class AssessmentsFrequency(BaseModel): + weekly: Cases + biweekly: Cases + quarterly: Cases + +class AssessmentPrediction(BaseModel): + AssessmentN: AssessmentsFrequency + +class AssessmentPredictionsResponse(BaseModel): + predictions: List[AssessmentPrediction] diff --git a/src/pipeline/data_preprocessor.py b/src/pipeline/data_preprocessor.py index adccf94..c895ba5 100644 --- a/src/pipeline/data_preprocessor.py +++ b/src/pipeline/data_preprocessor.py @@ -4,11 +4,6 @@ import logging from logging.handlers import RotatingFileHandler -handler = RotatingFileHandler('/root/ds_erp_ai/logs/prediction_pipeline.log', maxBytes=100000, backupCount=3) -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(handler) - class DataPreprocessor: def __init__(self, input_path, company_id): self.input_path = input_path diff --git a/src/pipeline/model_trainer.py b/src/pipeline/model_trainer.py index dd7b769..c7f70cc 100644 --- a/src/pipeline/model_trainer.py +++ b/src/pipeline/model_trainer.py @@ -33,7 +33,8 @@ class ModelTrainer: y = self.df[['open_items', 'red_flags']] # Multi-target for open items and red flags # Split into training and test sets with 10% as test size - X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.1, random_state=42) + X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=False) + # Train the model self.model.fit(X_train, y_train) diff --git a/src/prompts/chatbot.py b/src/prompts/chatbot.py index 847f3ee..b0fed5e 100644 --- a/src/prompts/chatbot.py +++ b/src/prompts/chatbot.py @@ -1,3 +1,5 @@ + + def validate_worker_prompt() -> str: return """ You are a worker in the company "Validate" where you are asked a specific yes or no question: @@ -15,4 +17,133 @@ def validate_worker_prompt() -> str: result:"validated" } """ - \ No newline at end of file +def predict_based_past_assessment_prompt(query,company_info, summary_stats): + # Extract company information from the dictionary + company_name = company_info['company_name'] + company_size = company_info['company_size'] + departments = company_info['departments'] + + # Create the prompt with the provided company info and summary statistics + prompt = f""" + **Prompt for the Chatbot:** + + **Context:** + You are an AI assistant working for {company_name}, and your primary responsibility is to provide **insights**, **predictions**, and **recommendations** based on the company's past assessment data and organizational structure. You are not allowed to respond to any queries outside of this domain. + + **General Company Information:** + - **Company Name**: {company_name} + - **Company Size**: {company_size} (e.g., Small, Medium, Large) + - **Departments**: + {', '.join(departments)} + + **Assessment Summary**: + The following is a detailed summary of past assessments at {company_name}. Use this information to provide predictions and recommendations based on trends and data points. + + - **Open Items and Red Flags**: + - Total Open Items: {summary_stats['Open Items and Red Flags']['Total Open Items']} + - Average Open Items per Assessment: {summary_stats['Open Items and Red Flags']['Average Open Items per Assessment']} + - Total Red Flags: {summary_stats['Open Items and Red Flags']['Total Red Flags']} + - Average Red Flags per Assessment: {summary_stats['Open Items and Red Flags']['Average Red Flags per Assessment']} + - Max Red Flags in a Single Assessment: {summary_stats['Open Items and Red Flags']['Max Red Flags in a Single Assessment']} + - Most Common Area with Red Flags: {summary_stats['Open Items and Red Flags']['Most Common Area with Red Flags']} + + - **Assessment Frequency**: + - Weekly: {summary_stats['Assessment Frequency']['Assessment Type Breakdown'].get('Weekly', 0) * 100}% + - Bi-Weekly: {summary_stats['Assessment Frequency']['Assessment Type Breakdown'].get('Bi-Weekly', 0) * 100}% + - Quarterly: {summary_stats['Assessment Frequency']['Assessment Type Breakdown'].get('Quarterly', 0) * 100}% + - Average Time Between Assessments: {summary_stats['Assessment Frequency']['Average Time Between Assessments']} days + - Average Assessment Duration: {summary_stats['Assessment Frequency']['Average Assessment Duration']} days + + - **Assessment Start and End Dates**: + - Longest Assessment Duration: {summary_stats['Assessment Start and End Dates']['Longest Assessment Duration (days)']} days + - Shortest Assessment Duration: {summary_stats['Assessment Start and End Dates']['Shortest Assessment Duration (days)']} days + + - **Assessment Areas**: + - Most Assessed Area: {summary_stats['Assessment Areas']['Most Assessed Area']} + - Most Open Items in Area: {summary_stats['Assessment Areas']['Most Open Items in Area']} + - Area with Most Red Flags: {summary_stats['Assessment Areas']['Area with Most Red Flags']} + + - **Assessment Status**: + - Completed: {summary_stats['Assessment Status']['Assessment Status Distribution'].get('Completed', 0) * 100}% + - In Progress: {summary_stats['Assessment Status']['Assessment Status Distribution'].get('In Progress', 0) * 100}% + - Incomplete: {summary_stats['Assessment Status']['Assessment Status Distribution'].get('Incomplete', 0) * 100}% + + - **Assessment Admin**: + - Most Frequent Admin: {summary_stats['Assessment Admin']['Most Frequent Admin']} + - Admin with Fewest Red Flags: {summary_stats['Assessment Admin']['Admin with Fewest Red Flags']} + - Admin with Most Open Items: {summary_stats['Assessment Admin']['Admin with Most Open Items']} + + **Instructions:** + Use the above information to answer user queries. You should: + - Analyze historical data to identify trends and problem areas. + - Predict potential outcomes for future assessments based on past performance (e.g., meeting deadlines, reducing red flags). + - Provide **actionable recommendations** that can help improve performance in future assessments. + + **User Query**: + "{query}" + + **Your Response**: + Predict and provide recommendations based on the company’s historical data, focusing on the areas most relevant to the query. Ensure the response is based on past trends and performance issues. + + **Examples of Insightful Responses**: + - "To improve your performance in the next assessment, you should focus on reducing red flags in the Communication department, as it has had the most issues." + - "Based on the company's past performance, there is a 70% chance that you will meet the deadline for the next weekly assessment. To ensure success, focus on completing open items in the IT department." + - "The data indicates that quarterly assessments have the highest rate of incomplete tasks. I recommend prioritizing quarterly assessment tasks to avoid falling behind." + """ + + return prompt + + + +def predict_next_n_assessments_prompt(): + + # Create the prompt with provided company info, summary statistics, and number of assessments (n) + prompt = """ + **Prompt for the Chatbot:** + + **Context:** + You are an AI assistant responsible for analyzing the past assessment data of , and your primary responsibility is to provide **predictions** for the next {n} assessments. + These assessments can occur on a **weekly**, **bi-weekly**, or **quarterly** basis. Use the company's past performance to predict the following for each of the next {n} assessments: + - **Number of Open Items**. + - **Number of Red Flags**. + - **Predictions for Weekly, Bi-Weekly, and Quarterly assessments**. + input : + - company basic info + - past assessment statitics + - N - number of next assessments to be predicted + **General Company Information:** + + + **Assessment Summary (Past Data)**: + The Detailed information on past asssessment will be provided. Use this information to make predictions for the next {n} assessments. + + + **Instructions**: + - Predict the number of open items and red flags for the next n assessments if they are conducted on a weekly, bi-weekly, or quarterly basis. + - Use the historical summary statistics provided above to guide your predictions. + - Return the response in the following JSON format: + + **Response Format**: + + { + "assessment 1": [ + { + "weekly": {"open_items": X, "red_flags": Y}}, + "biweekly": {{"open_items": X, "red_flags": Y}}, + "quarterly": {{"open_items": X, "red_flags": Y}} + } + ], + "assessment 2": [ + { + "weekly": {"open_items": X, "red_flags": Y}, + "biweekly": {"open_items": X, "red_flags": Y}, + "quarterly": {"open_items": X, "red_flags": Y} + } + ] + // assuming N is 2 + } + ``` + Ensure each assessment is provided with three predictions: one for Weekly, one for Bi-Weekly, and one for Quarterly assessments. + """ + + return prompt diff --git a/src/services/chatbot.py b/src/services/chatbot.py index d9cffbd..493dba8 100644 --- a/src/services/chatbot.py +++ b/src/services/chatbot.py @@ -7,6 +7,7 @@ from src.prompts.sops import * from src.prompts.chatbot import * from src.models.sop_response_schemas import * from src.models.bot_response_schema import * +from scripts.assessment_data import generate_summary_stats_v2 from dotenv import load_dotenv load_dotenv() @@ -52,7 +53,7 @@ class Chatbot: } ], response_format=ValidateWorker, - max_tokens=4096, + max_tokens=1024, temperature=0.1 ) @@ -64,3 +65,121 @@ class Chatbot: except Exception as e: print(f"An error occurred: {e}") return None + + + + + def predict_based_on_past_assessment(self, query, company_info, companyid) -> Result: + """ + This method generates predictions based on past assessment data of a company. It queries the backend for the + company's assessment data, generates a prompt, and then uses the GPT-4 model to return predictions based on the query. + + :param query: The question or query asked by the user. + :param company_info: General information about the company (name, size, departments, etc.). + :param companyid: Unique identifier of the company to fetch its specific data. + :return: Result containing the prediction result or None if an error occurs. + """ + try: + # Define the path to the company's assessment data (stored as a CSV) + data_path = os.path.join('data', 'raw', 'erp_company_assessment', f'{companyid}_raw_data.csv') + + # Generate summary statistics from the company's assessment data + summary_stats = generate_summary_stats_v2(file_path=data_path) + + + # Generate the prompt using the company info and the summary statistics + prompt = predict_based_past_assessment_prompt( + query=query, + company_info=company_info, + summary_stats=summary_stats + ) + + # Interact with GPT-4 model to get a response + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f"{prompt}" + }, + { + "role": "user", + "content": f"{query}", + } + ], + response_format=Result, + max_tokens=1024, + temperature=0.1 + ) + + # Extract and return the response from the GPT-4 model + extracted_text = json.loads(response.choices[0].message.content) + + return extracted_text + + except Exception as e: + print(f"An error occurred: {e}") + return None + + + def predict_next_n_assessment(self, company_info, companyid, N) -> AssessmentPredictionsResponse: + """ + This method generates predictions based on past assessment data of a company. It queries the backend for the + company's assessment data, generates a prompt, and then uses the GPT-4 model to return predictions based on the query. + + :param query: The question or query asked by the user. + :param company_info: General information about the company (name, size, departments, etc.). + :param companyid: Unique identifier of the company to fetch its specific data. + :param N: Number of assessments to predict. + :return: Result containing the prediction result or None if an error occurs. + """ + try: + # Define the path to the company's assessment data (stored as a CSV) + data_path = os.path.join('data', 'raw', 'erp_company_assessment', f'{companyid}_raw_data.csv') + + # Generate summary statistics from the company's assessment data + summary_stats = generate_summary_stats_v2(file_path=data_path) + + # Generate the prompt using the company info and the summary statistics + prompt = predict_next_n_assessments_prompt() + + + # Interact with GPT-4 model to get a response + response = self.client.beta.chat.completions.parse( + model=self.model, + messages=[ + { + "role": "system", + "content": f"{prompt}" + }, + { + "role": "user", + "content": f"company info: {company_info}--> N-value is {N} ", + }, + { + "role": "user", + "content": f"Summary stats: {summary_stats}", + } + ], + response_format=AssessmentPredictionsResponse, + max_tokens=1024, + temperature=0.1 + ) + + # Extract the response from the GPT-4 model + extracted_text = json.loads(response.choices[0].message.content) + + # Initialize dictionary to store assessments with dynamic names + predictions = {} + + # Loop through the predicted assessments and rename them dynamically + for i in range(N): + assessment_key = f"assessment_{i + 1}" + predictions[assessment_key] = extracted_text["predictions"][i]['AssessmentN'] + + # Return the dynamically named assessments + return predictions + + except Exception as e: + print(f"An error occurred: {e}") + return None diff --git a/test.py b/test.py index 11fbc78..09ca595 100644 --- a/test.py +++ b/test.py @@ -6,9 +6,7 @@ input_base_path = '/root/ds_erp_ai/data/raw/erp_assessment_prediction' # The ba pipeline = CompanyModelPipeline(company_ids=company_ids, input_base_path=input_base_path) pipeline.run_pipeline()''' -from src.pipeline.inference import AssessmentInference - - +'''from src.pipeline.inference import AssessmentInference inference = AssessmentInference( company_id="testid",num_assessments=2 @@ -18,3 +16,32 @@ result = inference.run() print(result) +''' +''' +response2 = bot.predict_next_n_assessment( + company_info=company_info, + companyid="testid", + N=3 +) + +print(f"Predictions {response2}") +''' + +from src.services.chatbot import Chatbot +company_info = { + 'company_name': "ABC Corp", + 'company_size': "Medium", # Can be "Small", "Medium", or "Large" + 'departments': ["Sales", "Marketing", "IT", "Finance", "HR", "Logistics"] +} +bot = Chatbot() +response = bot.predict_based_on_past_assessment( + query="Should i make my next assessment weekly or biweekly to meet up to deadline?", + company_info=company_info, + companyid="testid" +) + +print(f"Result: {response}") + + + +