import pandas as pd def generate_summary_stats_v2(file_path): # Load the DataFrame from the provided file path df = pd.read_csv(file_path) # Ensure date columns are correctly parsed df['Assessment_Start_Date'] = pd.to_datetime(df['Assessment_Start_Date']) df['Assessment_End_Date'] = pd.to_datetime(df['Assessment_End_Date']) # Add completion rate calculation completed_status = df['Assessment_Status'] == 'Completed' completion_rate_by_frequency = df[completed_status].groupby('Assessment_Frequency').size() / df.groupby('Assessment_Frequency').size() in_progress_status = df['Assessment_Status'] == 'In Progress' incomplete_status = df['Assessment_Status'] == 'Incomplete' # Calculate in-progress and incomplete rates by frequency in_progress_rate_by_frequency = df[in_progress_status].groupby('Assessment_Frequency').size() / df.groupby('Assessment_Frequency').size() incomplete_rate_by_frequency = df[incomplete_status].groupby('Assessment_Frequency').size() / df.groupby('Assessment_Frequency').size() # Fill NaN values (where no assessments are in-progress or incomplete for certain frequencies) completion_rate_by_frequency = completion_rate_by_frequency.fillna(0) in_progress_rate_by_frequency = in_progress_rate_by_frequency.fillna(0) incomplete_rate_by_frequency = incomplete_rate_by_frequency.fillna(0) # Round all numerical values to 2 decimal places completion_rate_by_frequency = completion_rate_by_frequency.round(2) in_progress_rate_by_frequency = in_progress_rate_by_frequency.round(2) incomplete_rate_by_frequency = incomplete_rate_by_frequency.round(2) summary_stats = { 'Open Items and Red Flags': { 'Total Open Items': round(df['Open_Items'].sum(), 2), 'Average Open Items per Assessment': round(df['Open_Items'].mean(), 2), 'Total Red Flags': round(df['Red_Flags'].sum(), 2), 'Average Red Flags per Assessment': round(df['Red_Flags'].mean(), 2), 'Max Red Flags in a Single Assessment': round(df['Red_Flags'].max(), 2), 'Most Common Area with Red Flags': df[df['Red_Flags'] > 0]['Assessment_Area'].mode()[0] }, 'Assessment Frequency': { 'Assessment Type Breakdown': df['Assessment_Frequency'].value_counts(normalize=True).round(2).to_dict(), 'Average Time Between Assessments': round((df['Assessment_End_Date'] - df['Assessment_Start_Date']).dt.days.mean(), 2), 'Average Assessment Duration': round(df['Assessment_End_Date'].sub(df['Assessment_Start_Date']).dt.days.mean(), 2), 'Completion Rate by Frequency': completion_rate_by_frequency.to_dict(), 'In Progress Rate by Frequency': in_progress_rate_by_frequency.to_dict(), 'Incomplete Rate by Frequency': incomplete_rate_by_frequency.to_dict() }, 'Assessment Start and End Dates': { 'Longest Assessment Duration (days)': round(df['Assessment_End_Date'].sub(df['Assessment_Start_Date']).dt.days.max(), 2), 'Shortest Assessment Duration (days)': round(df['Assessment_End_Date'].sub(df['Assessment_Start_Date']).dt.days.min(), 2), }, 'Assessment Areas': { 'Most Assessed Area': df['Assessment_Area'].value_counts().idxmax(), 'Most Open Items in Area': df.groupby('Assessment_Area')['Open_Items'].sum().idxmax(), 'Area with Most Red Flags': df.groupby('Assessment_Area')['Red_Flags'].sum().idxmax() }, 'Assessment Status': { 'Assessment Status Distribution': df['Assessment_Status'].value_counts(normalize=True).round(2).to_dict(), 'Incomplete Assessments': round(df[df['Assessment_Status'] == 'Incomplete'].shape[0], 2), 'In Progress Assessments': round(df[df['Assessment_Status'] == 'In Progress'].shape[0], 2) }, 'Assessment Admin': { 'Most Frequent Admin': df['Assessment_Admin'].mode()[0], 'Admin with Fewest Red Flags': df.groupby('Assessment_Admin')['Red_Flags'].sum().idxmin(), 'Admin with Most Open Items': df.groupby('Assessment_Admin')['Open_Items'].mean().idxmax() } } return summary_stats