scripts/generate_summary.py

import os
import logging
from typing import Dict, List, Any, Optional
from dotenv import load_dotenv
import json
from pydantic import BaseModel
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from src.prompt import advanced_summary_prompt, basic_summary_prompt, custom_template_prompt

# Runs at import time: load_dotenv() reads a .env file (if one is found) into
# the process environment, so the os.getenv() lookups in the summary functions
# below can pick up the API key from it.
load_dotenv()

# Setup logger
# Module-scoped logger; no handlers or levels are configured in this file.
logger = logging.getLogger(__name__)

# ============================================================================
# Pydantic Models for Structured Outputs
# ============================================================================

# Basic Summary Models (Freemium Plan)
# Element type of BasicSummary.Key_Points.
# NOTE: the docstring and field names below feed the schema derived from this
# model, so they are left exactly as written.
class KeyPoint(BaseModel):
    """A key point from the meeting"""
    text: str
    # presumably the position of the key point in the recording, in seconds —
    # TODO confirm the unit against basic_summary_prompt
    timestamp: float

# Type of the BasicSummary.Summary field (the field and this class share a name).
class Summary(BaseModel):
    """Overall summary of the meeting"""
    text: str
    # presumably the total meeting length in minutes — TODO confirm against
    # basic_summary_prompt
    duration_minutes: float

# Top-level schema that general_summary() selects when plan_tier is
# "freemium". It is handed to with_structured_output(), and the result is
# returned to callers through model_dump(), so the capitalised field names
# (Key_Points, Summary) are the keys callers see.
class BasicSummary(BaseModel):
    """Basic summary structure for freemium plan"""
    Key_Points: List[KeyPoint]
    # NOTE(review): this field has the same name as its type. The annotation
    # resolves to the module-level Summary class because nothing is assigned to
    # the name inside this class body; adding a default value here would
    # presumably shadow the class — verify before changing.
    Summary: Summary


# Advanced Summary Models (Pro Plan)
# Type of the AdvancedSummary.Purpose field: a single free-text value.
class Purpose(BaseModel):
    """Purpose of the meeting"""
    text: str

# Element type of Chapter.content.
class ChapterContent(BaseModel):
    """Content item within a chapter"""
    text: str
    # presumably the start/end of the transcript span this item was derived
    # from, in seconds — TODO confirm unit and meaning against
    # advanced_summary_prompt
    original_transcript_start: float
    original_transcript_end: float

# Element type of the `words_time_stamp` lists on Chapter, OutcomeContent and
# ActionItem: one word paired with one time value.
class WordTimestamp(BaseModel):
    """Word-level timestamp"""
    word: str
    # presumably seconds from the start of the recording — TODO confirm unit
    timestamp: float

# Start/end pair used as the `time_stamp` field of Chapter, OutcomeContent and
# ActionItem.
class TimeStamp(BaseModel):
    """Time range"""
    # presumably seconds from the start of the recording — TODO confirm unit;
    # nothing here enforces start <= end
    start: float
    end: float

# Element type of Chapters.content: one titled section of the meeting.
class Chapter(BaseModel):
    """A chapter in the meeting"""
    chapter: str  # presumably the chapter title — TODO confirm against advanced_summary_prompt
    time_stamp: TimeStamp  # time range covered by this chapter
    content: List[ChapterContent]
    words_time_stamp: List[WordTimestamp]  # word-level timings associated with this chapter

# Type of the AdvancedSummary.Chapters field: a list of Chapter entries plus a
# minutes total.
class Chapters(BaseModel):
    """Chapters section"""
    # presumably the combined duration of all chapters in minutes — TODO confirm
    minutes_total: float
    content: List[Chapter]

# Element type of Outcomes.content. Has the same three fields as ActionItem.
class OutcomeContent(BaseModel):
    """Content item in outcomes"""
    text: str
    time_stamp: TimeStamp  # time range associated with this outcome
    words_time_stamp: List[WordTimestamp]  # word-level timings associated with this outcome

# Type of the AdvancedSummary.Outcomes field: a list of OutcomeContent entries
# plus a minutes total.
class Outcomes(BaseModel):
    """Outcomes section"""
    # presumably the combined duration of the listed outcomes in minutes — TODO confirm
    minutes_total: float
    content: List[OutcomeContent]

# Element type of ActionItemsPerUser.action_items. Has the same three fields as
# OutcomeContent.
class ActionItem(BaseModel):
    """An action item"""
    text: str
    time_stamp: TimeStamp  # time range associated with this action item
    words_time_stamp: List[WordTimestamp]  # word-level timings associated with this action item

# Element type of AdvancedSummary.Action_Items_Per_User: the action items
# grouped under one speaker.
class ActionItemsPerUser(BaseModel):
    """Action items for a specific user"""
    speaker: str  # presumably the speaker label used in the transcript — TODO confirm
    # presumably minutes attributed to this speaker's action items — TODO confirm
    minutes_total: float
    action_items: List[ActionItem]

# Top-level schema that general_summary() selects for every plan_tier other
# than "freemium". It is handed to with_structured_output(), and the result is
# returned to callers through model_dump(), so the capitalised field names
# below are the keys callers see.
# NOTE(review): Purpose, Chapters and Outcomes are each named the same as their
# type. The annotations resolve to the module-level classes because nothing is
# assigned to those names inside this class body; adding default values here
# would presumably shadow the classes — verify before changing.
class AdvancedSummary(BaseModel):
    """Advanced summary structure for pro plan"""
    Purpose: Purpose
    Chapters: Chapters
    Outcomes: Outcomes
    Action_Items_Per_User: List[ActionItemsPerUser]


# ============================================================================
# Summary Generation Functions
# ============================================================================

def _fallback_reply_to_json(response):
    """Decode the plain (non-structured) fallback reply of ``general_summary``.

    Args:
        response: The object returned by ``model.invoke``. Its ``content``
            attribute is used when present, otherwise ``str(response)``.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If neither the raw text nor the contents of a
            Markdown code fence parse as JSON.
    """
    content = response.content if hasattr(response, 'content') else str(response)
    if isinstance(content, list):
        # Message content may be a list of blocks (plain strings or dicts that
        # carry a "text" entry) rather than one string; keep the textual parts.
        pieces = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                pieces.append(block["text"])
        content = "".join(pieces)

    text = str(content).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Same tolerance as custom_summary: accept JSON wrapped in a Markdown
        # code fence, preferring a ```json fence over a bare one.
        opener = "```json" if "```json" in text else "```"
        start = text.find(opener)
        if start == -1:
            raise
        start += len(opener)
        end = text.find("```", start)
        if end == -1:
            # Unterminated fence: take everything up to the end of the reply.
            end = len(text)
        return json.loads(text[start:end].strip())


def general_summary(transcription, plan_tier="pro"):
    """
    Generate a summary of the transcription based on the user's plan tier.
    Uses LangChain Anthropic with structured outputs.

    The "freemium" tier uses ``basic_summary_prompt`` with the ``BasicSummary``
    schema and a 2000-token limit; every other tier value (including ``None``)
    uses ``advanced_summary_prompt`` with ``AdvancedSummary`` and 4000 tokens.
    If the structured call raises, the same messages are sent once more
    without structured output and the reply is parsed as JSON; that fallback
    result is NOT validated against the schema.

    Args:
        transcription: The transcription to summarize (dict or JSON string).
            Dicts are serialised with ``json.dumps``; anything else goes
            through ``str()``.
        plan_tier: The user's plan tier ("freemium" or "pro"). Matching is
            case-insensitive and ignores surrounding whitespace.

    Returns:
        A dict containing the summary (structured output)

    Raises:
        ValueError: If no Anthropic API key is present in the environment.
        Exception: Whatever the fallback attempt raised, when both the
            structured call and the fallback fail.
    """
    # Get API key (the misspelt ANTHTROPIC_API_KEY is still honoured because
    # earlier versions of this code read that name)
    api_key = os.getenv("ANTHROPIC_API_KEY") or os.getenv("ANTHTROPIC_API_KEY")
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable is required")

    logger.info("Generating %s summary with structured output", plan_tier)

    # Convert transcription to string if it's a dict
    if isinstance(transcription, dict):
        transcription_str = json.dumps(transcription)
    else:
        transcription_str = str(transcription)

    # Normalise the tier so None or padded values cannot crash or silently
    # select the wrong schema.
    tier = "pro" if plan_tier is None else str(plan_tier).strip().lower()

    # Select the appropriate prompt and schema based on the user's plan tier
    if tier == "freemium":
        prompt = basic_summary_prompt
        max_tokens = 2000
        output_schema = BasicSummary
    else:  # Default to pro
        prompt = advanced_summary_prompt
        max_tokens = 4000
        output_schema = AdvancedSummary

    # Initialize LangChain Anthropic model
    model = ChatAnthropic(
        model="claude-sonnet-4-5-20250929",
        api_key=api_key,
        temperature=0.2,
        max_tokens=max_tokens
    )

    # Create messages directly to avoid template variable parsing issues
    messages = [
        SystemMessage(content="You are an AI meeting transcript summary formatter. Follow the instructions carefully and return structured output."),
        HumanMessage(content=prompt + "\n\nTranscription: " + transcription_str)
    ]

    # Use structured output
    structured_model = model.with_structured_output(output_schema)

    try:
        # Invoke the structured model with messages
        result = structured_model.invoke(messages)

        # Convert Pydantic model to dict
        if isinstance(result, BaseModel):
            logger.info("Successfully generated %s summary with structured output", plan_tier)
            return result.model_dump()

        logger.info("Successfully generated %s summary", plan_tier)
        return result

    except Exception as e:
        # Log error and return fallback
        logger.error("Error generating summary with structured output: %s", e)
        # Fallback: try without structured output
        try:
            logger.warning("Falling back to non-structured output")
            response = model.invoke(messages)
            return _fallback_reply_to_json(response)
        except Exception as fallback_error:
            logger.error("Fallback also failed: %s", fallback_error)
            # Bare raise re-raises the fallback error; the structured-output
            # error stays attached as its __context__.
            raise


def _custom_reply_text(response):
    """Return the textual content of a model reply as a single string.

    Uses ``response.content`` when present, otherwise ``str(response)``. When
    the content is a list of blocks (plain strings or dicts carrying a "text"
    entry), the textual parts are concatenated in order.
    """
    content = response.content if hasattr(response, 'content') else str(response)
    if isinstance(content, list):
        pieces = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                pieces.append(block["text"])
        content = "".join(pieces)
    return content


def _fenced_payload(text, opener):
    """Return the stripped text following ``opener`` up to the next ``` fence.

    Returns ``None`` when ``opener`` does not occur in ``text``. If the fence
    is never closed, everything after the opener is returned.
    """
    start = text.find(opener)
    if start == -1:
        return None
    start += len(opener)
    end = text.find("```", start)
    if end == -1:
        # str.find returns -1 for "not found"; slicing with it would silently
        # drop the last character of the payload.
        end = len(text)
    return text[start:end].strip()


def custom_summary(template, transcription):
    """
    Generate a custom summary based on a user-defined template.
    Uses LangChain Anthropic.

    The reply is parsed as JSON directly; if that fails, the contents of the
    first ```json fence (or, failing that, the first bare ``` fence) are
    parsed instead.

    Args:
        template: The custom template (dict or JSON string). Dicts are
            serialised with ``json.dumps``; anything else goes through
            ``str()``.
        transcription: The transcription to summarize (dict or JSON string),
            converted the same way as ``template``.

    Returns:
        A dict containing the custom summary

    Raises:
        ValueError: If no Anthropic API key is present in the environment, or
            if the reply is not JSON and contains no code fence
            (``json.JSONDecodeError``, a ``ValueError`` subclass, is raised
            when a fence is present but its contents do not parse).
        Exception: Any error raised by the model call is logged and re-raised.
    """
    # Get API key (the misspelt ANTHTROPIC_API_KEY is still honoured because
    # earlier versions of this code read that name)
    api_key = os.getenv("ANTHROPIC_API_KEY") or os.getenv("ANTHTROPIC_API_KEY")
    if not api_key:
        raise ValueError("ANTHROPIC_API_KEY environment variable is required")

    logger.info("Generating custom summary")

    # Convert to strings if needed
    if isinstance(template, dict):
        template_str = json.dumps(template)
    else:
        template_str = str(template)

    if isinstance(transcription, dict):
        transcription_str = json.dumps(transcription)
    else:
        transcription_str = str(transcription)

    # Initialize LangChain Anthropic model
    model = ChatAnthropic(
        model="claude-sonnet-4-5-20250929",  # Using the same model as general_summary
        api_key=api_key,
        temperature=0.2,
        max_tokens=8000
    )

    # Create messages directly to avoid template variable parsing issues
    messages = [
        SystemMessage(content="You are an AI meeting transcript summary formatter. Follow the user-defined template structure exactly."),
        HumanMessage(content=custom_template_prompt + "\n\nTEMPLATE: " + template_str + "\n\nTranscription: " + transcription_str)
    ]

    try:
        response = model.invoke(messages)
        text = _custom_reply_text(response)

        # Try to parse as JSON
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            # If it's wrapped in markdown code blocks, try to extract JSON.
            # A ```json fence takes priority over a bare ``` fence.
            for opener in ("```json", "```"):
                payload = _fenced_payload(text, opener)
                if payload is not None:
                    return json.loads(payload)
            raise ValueError(f"Could not parse response as JSON: {text[:200]}")

    except Exception as e:
        logger.error("Error generating custom summary: %s", e)
        raise
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`import os`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`import logging`
			`from typing import Dict, List, Any, Optional`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`from dotenv import load_dotenv`
			`import json`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`from pydantic import BaseModel`
			`from langchain_anthropic import ChatAnthropic`
			`from langchain_core.prompts import ChatPromptTemplate`
			`from langchain_core.messages import HumanMessage, SystemMessage`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`from src.prompt import advanced_summary_prompt, basic_summary_prompt, custom_template_prompt`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`load_dotenv()`

Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`# Setup logger`
			`logger = logging.getLogger(__name__)`

			`# ============================================================================`
			`# Pydantic Models for Structured Outputs`
			`# ============================================================================`

			`# Basic Summary Models (Freemium Plan)`
			`class KeyPoint(BaseModel):`
			`"""A key point from the meeting"""`
			`text: str`
			`timestamp: float`

			`class Summary(BaseModel):`
			`"""Overall summary of the meeting"""`
			`text: str`
			`duration_minutes: float`

			`class BasicSummary(BaseModel):`
			`"""Basic summary structure for freemium plan"""`
			`Key_Points: List[KeyPoint]`
			`Summary: Summary`


			`# Advanced Summary Models (Pro Plan)`
			`class Purpose(BaseModel):`
			`"""Purpose of the meeting"""`
			`text: str`

			`class ChapterContent(BaseModel):`
			`"""Content item within a chapter"""`
			`text: str`
			`original_transcript_start: float`
			`original_transcript_end: float`

			`class WordTimestamp(BaseModel):`
			`"""Word-level timestamp"""`
			`word: str`
			`timestamp: float`

			`class TimeStamp(BaseModel):`
			`"""Time range"""`
			`start: float`
			`end: float`

			`class Chapter(BaseModel):`
			`"""A chapter in the meeting"""`
			`chapter: str`
			`time_stamp: TimeStamp`
			`content: List[ChapterContent]`
			`words_time_stamp: List[WordTimestamp]`

			`class Chapters(BaseModel):`
			`"""Chapters section"""`
			`minutes_total: float`
			`content: List[Chapter]`

			`class OutcomeContent(BaseModel):`
			`"""Content item in outcomes"""`
			`text: str`
			`time_stamp: TimeStamp`
			`words_time_stamp: List[WordTimestamp]`

			`class Outcomes(BaseModel):`
			`"""Outcomes section"""`
			`minutes_total: float`
			`content: List[OutcomeContent]`

			`class ActionItem(BaseModel):`
			`"""An action item"""`
			`text: str`
			`time_stamp: TimeStamp`
			`words_time_stamp: List[WordTimestamp]`

			`class ActionItemsPerUser(BaseModel):`
			`"""Action items for a specific user"""`
			`speaker: str`
			`minutes_total: float`
			`action_items: List[ActionItem]`

			`class AdvancedSummary(BaseModel):`
			`"""Advanced summary structure for pro plan"""`
			`Purpose: Purpose`
			`Chapters: Chapters`
			`Outcomes: Outcomes`
			`Action_Items_Per_User: List[ActionItemsPerUser]`


			`# ============================================================================`
			`# Summary Generation Functions`
			`# ============================================================================`

Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`def general_summary(transcription, plan_tier="pro"):`
			`"""`
			`Generate a summary of the transcription based on the user's plan tier.`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`Uses LangChain Anthropic with structured outputs.`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00
			`Args:`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`transcription: The transcription to summarize (dict or JSON string)`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`plan_tier: The user's plan tier ("freemium" or "pro")`

			`Returns:`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`A dict containing the summary (structured output)`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`"""`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`# Get API key (note: original code had typo ANTHTROPIC_API_KEY)`
			`api_key = os.getenv("ANTHROPIC_API_KEY") or os.getenv("ANTHTROPIC_API_KEY")`
			`if not api_key:`
			`raise ValueError("ANTHROPIC_API_KEY environment variable is required")`

			`logger.info(f"Generating {plan_tier} summary with structured output")`

			`# Convert transcription to string if it's a dict`
			`if isinstance(transcription, dict):`
			`transcription_str = json.dumps(transcription)`
			`else:`
			`transcription_str = str(transcription)`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`# Select the appropriate prompt and schema based on the user's plan tier`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`if plan_tier.lower() == "freemium":`
			`prompt = basic_summary_prompt`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`max_tokens = 2000`
			`output_schema = BasicSummary`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`else: # Default to pro`
			`prompt = advanced_summary_prompt`
			`max_tokens = 4000`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`output_schema = AdvancedSummary`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`# Initialize LangChain Anthropic model`
			`model = ChatAnthropic(`
			`model="claude-sonnet-4-5-20250929",`
			`api_key=api_key,`
			`temperature=0.2,`
			`max_tokens=max_tokens`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`)`

Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`# Create messages directly to avoid template variable parsing issues`
			`messages = [`
			`SystemMessage(content="You are an AI meeting transcript summary formatter. Follow the instructions carefully and return structured output."),`
			`HumanMessage(content=prompt + "\n\nTranscription: " + transcription_str)`
			`]`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`# Use structured output`
			`structured_model = model.with_structured_output(output_schema)`

			`try:`
			`# Invoke the structured model with messages`
			`result = structured_model.invoke(messages)`

			`# Convert Pydantic model to dict`
			`if isinstance(result, BaseModel):`
			`logger.info(f"Successfully generated {plan_tier} summary with structured output")`
			`return result.model_dump()`
			`else:`
			`logger.info(f"Successfully generated {plan_tier} summary")`
			`return result`

			`except Exception as e:`
			`# Log error and return fallback`
			`logger.error(f"Error generating summary with structured output: {e}")`
			`# Fallback: try without structured output`
			`try:`
			`logger.warning("Falling back to non-structured output")`
			`response = model.invoke(messages)`
			`text = response.content if hasattr(response, 'content') else str(response)`
			`return json.loads(text)`
			`except Exception as fallback_error:`
			`logger.error(f"Fallback also failed: {fallback_error}")`
			`raise`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00

			`def custom_summary(template, transcription):`
Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`"""`
			`Generate a custom summary based on a user-defined template.`
			`Uses LangChain Anthropic.`

			`Args:`
			`template: The custom template (dict or JSON string)`
			`transcription: The transcription to summarize (dict or JSON string)`

			`Returns:`
			`A dict containing the custom summary`
			`"""`
			`# Get API key (note: original code had typo ANTHTROPIC_API_KEY)`
			`api_key = os.getenv("ANTHROPIC_API_KEY") or os.getenv("ANTHTROPIC_API_KEY")`
			`if not api_key:`
			`raise ValueError("ANTHROPIC_API_KEY environment variable is required")`

			`logger.info("Generating custom summary")`

			`# Convert to strings if needed`
			`if isinstance(template, dict):`
			`template_str = json.dumps(template)`
			`else:`
			`template_str = str(template)`

			`if isinstance(transcription, dict):`
			`transcription_str = json.dumps(transcription)`
			`else:`
			`transcription_str = str(transcription)`

			`# Initialize LangChain Anthropic model`
			`model = ChatAnthropic(`
			`model="claude-sonnet-4-5-20250929", # Using the same model as general_summary`
			`api_key=api_key,`
			`temperature=0.2,`
			`max_tokens=8000`
Add tiered summarization based on pricing plans 2025-04-24 10:15:13 +01:00			`)`

Fix ChatPromptTemplate variable parsing issue and update tests 2025-11-11 20:11:53 +00:00			`# Create messages directly to avoid template variable parsing issues`
			`messages = [`
			`SystemMessage(content="You are an AI meeting transcript summary formatter. Follow the user-defined template structure exactly."),`
			`HumanMessage(content=custom_template_prompt + "\n\nTEMPLATE: " + template_str + "\n\nTranscription: " + transcription_str)`
			`]`

			`try:`
			`response = model.invoke(messages)`
			`text = response.content if hasattr(response, 'content') else str(response)`

			`# Try to parse as JSON`
			`try:`
			`return json.loads(text)`
			`except json.JSONDecodeError:`
			`# If it's wrapped in markdown code blocks, try to extract JSON`
			if "```json" in text:
			json_start = text.find("```json") + 7
			json_end = text.find("```", json_start)
			`text = text[json_start:json_end].strip()`
			`return json.loads(text)`
			elif "```" in text:
			json_start = text.find("```") + 3
			json_end = text.find("```", json_start)
			`text = text[json_start:json_end].strip()`
			`return json.loads(text)`
			`else:`
			`raise ValueError(f"Could not parse response as JSON: {text[:200]}")`

			`except Exception as e:`
			`logger.error(f"Error generating custom summary: {e}")`
			`raise`