Add tiered summarization based on pricing plans

- Implement advanced AI summarization with action items for Pro plan
- Create basic bullet-point summarization for Freemium plan
- Add plan tier validation and feature differentiation
- Support speaker identification in transcripts
- Define plan limits (600 mins Pro/200 mins Freemium)
2025-04-24 17:18:53 +01:00
parent 316e82b6cf
commit a91613efe2
6 changed files with 270 additions and 73 deletions
@@ -33,11 +33,12 @@ Transcribes audio or video content from a URL.
 ```json
 {
  "media_url": "https://example.com/audio-file.mp3",
-  "media_type": "audio"  // "audio" or "video"
+  "media_type": "audio",  // "audio" or "video"
  "plan_tier": "freemium"  // "freemium" or "pro" (optional, defaults to "freemium")
 }
 ```
-**Response:**
+**Pro Plan Response (with speaker identification):**
 ```json
 {
@@ -45,7 +46,33 @@ Transcribes audio or video content from a URL.
    "sentences": [
      {
        "sentence": "Hello and welcome to the meeting.",
-        "speaker": "speaker_0",
+        "speaker": "speaker_0",  // Speaker identification included
        "start": 0.0,
        "end": 2.5,
        "words": [
          {
            "word": "Hello",
            "start": 0.0,
            "end": 0.4
          },
          // Additional words...
        ]
      },
      // Additional sentences...
    ]
  }
 }
 ```
 **Freemium Plan Response (without speaker identification):**
 ```json
 {
  "transcript": {
    "sentences": [
      {
        "sentence": "Hello and welcome to the meeting.",
        // No "speaker" field in freemium plan
        "start": 0.0,
        "end": 2.5,
        "words": [
@@ -271,14 +298,15 @@ headers = {
    "Content-Type": "application/json"
 }
-# 1. Transcribe an audio file
+# 1. Transcribe an audio file (Pro plan with speaker identification)
 transcribe_payload = {
    "media_url": "https://example.com/meeting-recording.mp3",
-    "media_type": "audio"
+    "media_type": "audio",
    "plan_tier": "pro"  # Specify "pro" for speaker identification or "freemium" for no speakers
 }
 transcribe_response = requests.post(
-    f"{base_url}/microdot-ai/transcribe", 
+    f"{base_url}/microdot-ai/transcribe",
    headers=headers,
    json=transcribe_payload
 )
@@ -287,19 +315,19 @@ transcribe_response = requests.post(
 if transcribe_response.status_code == 200:
    transcript_data = transcribe_response.json()
    transcript_json = json.dumps(transcript_data["transcript"])
-    
+
    # 2. Generate a summary (Pro plan)
    summary_payload = {
        "transcript": transcript_json,
        "plan_tier": "pro"
    }
-    
+
    summary_response = requests.post(
-        f"{base_url}/microdot-ai/general-summary", 
+        f"{base_url}/microdot-ai/general-summary",
        headers=headers,
        json=summary_payload
    )
-    
+
    if summary_response.status_code == 200:
        summary_data = summary_response.json()
        print("Summary generated successfully!")
@@ -316,7 +344,7 @@ else:
 const axios = require('axios');
 // API endpoint
-const baseUrl = 'https://api.microdot.ai';
+const baseUrl = 'http://localhost:5056';
 // Your API key
 const apiKey = 'your_api_key_here';
@@ -331,34 +359,35 @@ const headers = {
 const transcribeAudio = async () => {
  const transcribePayload = {
    media_url: 'https://example.com/meeting-recording.mp3',
-    media_type: 'audio'
+    media_type: 'audio',
    plan_tier: 'pro'  // Specify 'pro' for speaker identification or 'freemium' for no speakers
  };
-  
+
  try {
    const transcribeResponse = await axios.post(
      `${baseUrl}/microdot-ai/transcribe`,
      transcribePayload,
      { headers }
    );
-    
+
    const transcriptData = transcribeResponse.data;
    const transcriptJson = JSON.stringify(transcriptData.transcript);
-    
+
    // 2. Generate a summary (Pro plan)
    const summaryPayload = {
      transcript: transcriptJson,
      plan_tier: 'pro'
    };
-    
+
    const summaryResponse = await axios.post(
      `${baseUrl}/microdot-ai/general-summary`,
      summaryPayload,
      { headers }
    );
-    
+
    console.log('Summary generated successfully!');
    console.log(JSON.stringify(summaryResponse.data.transcript, null, 2));
-    
+
  } catch (error) {
    console.error('Error:', error.response ? error.response.data : error.message);
  }
@@ -55,6 +55,7 @@ async def get_api_key(api_key_header: str = Security(api_key_header)) -> str:
 class TranscribeRequest(BaseModel):
    # Request body accepted by the transcription endpoint.
    media_url: Optional[str] = None  # URL of the remote media; the endpoint only transcribes when this is set
    media_type: Optional[str]  # "audio" or "video". NOTE(review): no default is declared, so whether this field is required depends on the pydantic version in use — confirm
    plan_tier: Optional[str] = "freemium"  # "freemium" or "pro"; defaults to the freemium plan if not specified
 class ChatResp(BaseModel):  # Added BaseModel inheritance
    error: Optional[str] = None
@@ -78,6 +79,16 @@ async def chat_endpoint(
    api_key: str = Depends(get_api_key)
 ):
    try:
        # Get the plan tier from the request or default to freemium
        plan_tier = request.plan_tier.lower() if request.plan_tier else "freemium"
        # Validate plan tier using our PlanTier enum
        valid_tiers = [t.value for t in PlanTier]
        if plan_tier not in valid_tiers:
            plan_tier = PlanTier.FREEMIUM.value  # Default to freemium if invalid tier
        # Check if the plan includes speaker identification
        include_speakers = PlanLimits.get_limit(plan_tier, "speaker_identification")
        # Use the transcribe_media function to transcribe the media
        if request.media_url:
@@ -88,7 +99,7 @@ async def chat_endpoint(
        # Parse response
        words = transcription_response["results"]["channels"][0]["alternatives"][0]["words"]
-        transcript = group_words_into_sentences(words=words)
+        transcript = group_words_into_sentences(words=words, include_speakers=include_speakers)
        return TranscriptResponse(
            transcript=transcript,  # Corrected to return the transcript
            error=None
@@ -28,7 +28,7 @@ def extract_audio(url: str, output_template=os.path.join(UPLOAD_FOLDER, "%(title
    """
    Download and extract audio from a video URL using yt-dlp.
    The file will be saved in the 'upload' folder.
-    
+
    Returns:
        str: The absolute path to the downloaded audio file (with a unique id appended).
    """
@@ -42,51 +42,51 @@ def extract_audio(url: str, output_template=os.path.join(UPLOAD_FOLDER, "%(title
        }],
        "quiet": True,
    }
-    
+
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # Prepare the filename from the info.
        # Note: prepare_filename returns the filename *before* postprocessing,
        # so we change the extension to mp3.
        original_filepath = os.path.splitext(ydl.prepare_filename(info))[0] + ".mp3"
-    
+
    # Debug: list files in the upload folder
    if not os.path.exists(original_filepath):
        files = os.listdir(UPLOAD_FOLDER)
        print("Warning: Could not find expected file.")
        print("Files in upload folder:", files)
        raise FileNotFoundError(f"Expected audio file not found: {original_filepath}")
-    
+
    # Get the video's title and sanitize it
    title = info.get('title', 'audio')
    safe_title = sanitize_filename(title)
-    
+
    # Generate a unique identifier
    unique_id = uuid.uuid4().hex  # Unique identifier in hex format
-    
+
    # Construct the new filename with the unique id appended.
    new_audio_filename = f"{safe_title}_{unique_id}.mp3"
    new_audio_filepath = os.path.join(UPLOAD_FOLDER, new_audio_filename)
-    
+
    # Rename the downloaded file to include the unique ID.
    os.rename(original_filepath, new_audio_filepath)
    print(f"Renamed file to: {new_audio_filepath}")
-    
+
    # Return the absolute path to the renamed audio file.
    return os.path.abspath(new_audio_filepath)
 def transcribe_media(file_loc: str, media_type: str = "audio"):
    """
    Transcribe media using Deepgram.
-    
+
    If media_type is "audio" (remote URL), use Deepgram's URL transcription.
    If media_type is "video" (remote URL), extract audio locally (in the upload folder),
    transcribe via file, and then delete the local audio file.
-    
+
    Args:
        file_loc (str): URL to the remote audio or video file.
        media_type (str): "audio" or "video".
-    
+
    Returns:
        dict: The transcription response from Deepgram.
    """
@@ -100,22 +100,22 @@ def transcribe_media(file_loc: str, media_type: str = "audio"):
            smart_format=True,
            diarize=True,
        )
-        
+
        if media_type.lower() == "audio":
            # For remote audio files, use the URL transcription method.
            response = deepgram.listen.rest.v("1").transcribe_url({"url": file_loc}, options)
-        
+
        elif media_type.lower() == "video":
            # For remote video files, first extract the audio locally.
            local_audio_path = extract_audio(file_loc)
            print(f"Extracted audio to: {local_audio_path}")
-            
+
            # Transcribe using the local file method.
            with open(local_audio_path, "rb") as file:
                buffer_data = file.read()
            payload: FileSource = {"buffer": buffer_data}
            response = deepgram.listen.rest.v("1").transcribe_file(payload, options)
-            
+
            # Clean up: delete the local audio file.
            if os.path.exists(local_audio_path):
                os.remove(local_audio_path)
@@ -123,7 +123,7 @@ def transcribe_media(file_loc: str, media_type: str = "audio"):
        else:
            raise ValueError("media_type must be either 'audio' or 'video'.")
-       
+
        return response
    except Exception as e:
@@ -137,54 +137,76 @@ def transcribe_media(file_loc: str, media_type: str = "audio"):
def _build_sentence(sentence_words, speaker, include_speakers):
    """Assemble one sentence entry from its (non-empty) list of word entries."""
    sentence = {"sentence": " ".join(w["word"] for w in sentence_words)}
    # The "speaker" key must be absent (not merely defaulted) for plans
    # without speaker identification, so it is added only on request.
    if include_speakers:
        sentence["speaker"] = speaker
    sentence["start"] = sentence_words[0]["start"]
    sentence["end"] = sentence_words[-1]["end"]
    sentence["words"] = sentence_words
    return sentence


def group_words_into_sentences(words, max_words=15, include_speakers=True):
    """
    Group transcription words into sentences, starting a new sentence on every speaker change.

    Args:
        words: List of word dicts from the transcription. Each must provide
            "punctuated_word", "start" and "end"; "speaker" is read only when
            include_speakers is True.
        max_words: Accepted for backward compatibility. It is not enforced:
            sentences are split on speaker changes only.
        include_speakers: Whether to include speaker information in the output
            (True for Pro plan, False for Freemium plan). When False, no
            "speaker" key is emitted and, since speaker changes are ignored,
            all words end up in a single sentence.

    Returns:
        A dictionary of the form {"sentences": [...]}, where each sentence has
        "sentence", "start", "end", "words" and (optionally) "speaker".
    """
    sentences = []
    current_words = []
    current_speaker = None

    for word_info in words:
        # Without speaker identification every word shares one constant
        # "speaker", so a speaker change is never detected.
        speaker = word_info["speaker"] if include_speakers else None

        # Close the running sentence as soon as a different speaker starts talking.
        if current_words and speaker != current_speaker:
            sentences.append(_build_sentence(current_words, current_speaker, include_speakers))
            current_words = []
        current_speaker = speaker

        # Append word with metadata inside the current sentence
        current_words.append({
            "word": word_info["punctuated_word"],
            "start": word_info["start"],
            "end": word_info["end"],
        })

    # Append the last sentence if any words remain
    if current_words:
        sentences.append(_build_sentence(current_words, current_speaker, include_speakers))

    return {"sentences": sentences}
 if __name__ == "__main__":
-    
+
    audio_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/019933724441Business%20English%20Conversation%20Lesson%2045_%20Meeting%20a%20New%20Colleague.mp3"
    video_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/038426704141Business%20English%20Conversation%20Lesson%2045_%20%20Meeting%20a%20New%20Colleague.mp4"
 # Folder for file uploads/downloads
-   
+
    response = transcribe_media(video_url, media_type="video")
    print(response)
@@ -15,13 +15,15 @@ class PlanLimits:
            "transcription_minutes": 200,
            "summary_type": "basic",
            "transcript_history_days": 7,
-            "integrations": ["google_meet", "zoom"]
+            "integrations": ["google_meet", "zoom"],
            "speaker_identification": False
        },
        PlanTier.PRO: {
            "transcription_minutes": 600,
            "summary_type": "advanced",
            "transcript_history_days": 30,
-            "integrations": ["google_meet", "zoom", "slack", "notion", "asana", "microsoft_teams"]
+            "integrations": ["google_meet", "zoom", "slack", "notion", "asana", "microsoft_teams"],
            "speaker_identification": True
        }
    }
@@ -19,33 +19,76 @@ headers = {
 # Audio URL from your notebook
 audio_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/019933724441Business%20English%20Conversation%20Lesson%2045_%20Meeting%20a%20New%20Colleague.mp3"

 # Filled in below once the matching request succeeds. Initialised up front so the
 # fallback logic at the end never touches an undefined name.
 pro_transcript_json = None
 free_transcript_json = None

 # 1. First, transcribe the audio with the Pro plan (with speaker identification)
 pro_transcribe_payload = {
     "media_url": audio_url,
     "media_type": "audio",
     "plan_tier": "pro"  # Specify the pro plan to include speaker identification
 }
 pro_transcribe_response = requests.post(
     f"{base_url}/microdot-ai/transcribe",
     headers=headers,
     json=pro_transcribe_payload
 )

 # Check if Pro plan transcription was successful
 if pro_transcribe_response.status_code == 200:
     pro_transcript_data = pro_transcribe_response.json()
     print("Pro plan transcription successful!")

     # Save the Pro plan transcript for later use
     pro_transcript_json = json.dumps(pro_transcript_data["transcript"], indent=4)

     # transcript.json is still written, now from the Pro data: the old
     # `transcript_data` name no longer exists after the rename.
     with open("transcript.json", "w") as f:
         f.write(pro_transcript_json)
     print("Transcript saved to transcript.json")

     # Save the Pro plan transcript to a file
     with open("pro_transcript.json", "w") as f:
         f.write(pro_transcript_json)
     print("Pro plan transcript saved to pro_transcript.json")

     # Check if the Pro plan transcript has speaker information
     pro_sentences = pro_transcript_data["transcript"]["sentences"]
     has_speaker_pro = "speaker" in pro_sentences[0] if pro_sentences else False
     print(f"Pro plan has speaker information: {has_speaker_pro}")
 else:
     print(f"Pro plan transcription failed with status code: {pro_transcribe_response.status_code}")
     print(pro_transcribe_response.text)

 # 1b. Now transcribe with the Free plan (without speaker identification)
 free_transcribe_payload = {
     "media_url": audio_url,
     "media_type": "audio",
     "plan_tier": "freemium"  # Specify the freemium plan to exclude speaker identification
 }
 free_transcribe_response = requests.post(
     f"{base_url}/microdot-ai/transcribe",
     headers=headers,
     json=free_transcribe_payload
 )

 # Check if Free plan transcription was successful
 if free_transcribe_response.status_code == 200:
     free_transcript_data = free_transcribe_response.json()
     print("Free plan transcription successful!")

     # Save the Free plan transcript for later use
     free_transcript_json = json.dumps(free_transcript_data["transcript"], indent=4)

     # Save the Free plan transcript to a file
     with open("free_transcript.json", "w") as f:
         f.write(free_transcript_json)
     print("Free plan transcript saved to free_transcript.json")

     # Check if the Free plan transcript has speaker information
     free_sentences = free_transcript_data["transcript"]["sentences"]
     has_speaker_free = "speaker" in free_sentences[0] if free_sentences else False
     print(f"Free plan has speaker information: {has_speaker_free}")
 else:
     print(f"Free plan transcription failed with status code: {free_transcribe_response.status_code}")
     print(free_transcribe_response.text)

 # Use the Pro plan transcript for the summary tests; if only the Free plan request
 # succeeded, fall back to its transcript instead of referencing an undefined name.
 if pro_transcript_json is not None:
     transcript_json = pro_transcript_json
 elif free_transcript_json is not None:
     transcript_json = free_transcript_json
 else:
     # Nothing to summarise: stop with a clear message rather than a later NameError.
     raise SystemExit("Both transcription requests failed; cannot run the summary tests.")
@@ -56,7 +99,7 @@ basic_summary_payload = {
 }
 basic_summary_response = requests.post(
-    f"{base_url}/microdot-ai/general-summary", 
+    f"{base_url}/microdot-ai/general-summary",
    headers=headers,
    json=basic_summary_payload
 )
@@ -83,7 +126,7 @@ advanced_summary_payload = {
 }
 advanced_summary_response = requests.post(
-    f"{base_url}/microdot-ai/general-summary", 
+    f"{base_url}/microdot-ai/general-summary",
    headers=headers,
    json=advanced_summary_payload
 )
@@ -93,7 +136,7 @@ if advanced_summary_response.status_code == 200:
    advanced_summary_data = advanced_summary_response.json()
    print("\n--- Advanced (Pro) Summary ---")
    advanced_summary_json = json.dumps(advanced_summary_data, indent=2)
-    
+
    with open("advanced_summary.json", "w") as f:
        f.write(advanced_summary_json)
    print("Advanced summary saved to advanced_summary.json")
@@ -0,0 +1,90 @@
 import os
 import requests
 import json
 from dotenv import load_dotenv
 load_dotenv()
 # API endpoint
 base_url = "http://localhost:5056"
 # Your API key
 api_key = os.getenv("API_KEY_ACCESS")
 # Headers
 headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
 }
 # Audio URL for testing
 audio_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/019933724441Business%20English%20Conversation%20Lesson%2045_%20Meeting%20a%20New%20Colleague.mp3"
 # Test function to compare freemium and pro plan transcriptions
 def _request_transcript(plan_tier, timeout=300):
     """POST the shared test audio to the transcribe endpoint for the given plan tier."""
     return requests.post(
         f"{base_url}/microdot-ai/transcribe",
         headers=headers,
         json={
             "media_url": audio_url,
             "media_type": "audio",
             "plan_tier": plan_tier,
         },
         # Without a timeout, requests waits indefinitely on an unresponsive server.
         timeout=timeout,
     )


 def _first_sentence_has_speaker(data):
     """Return True when the first transcript sentence carries a "speaker" key."""
     sentences = data["transcript"]["sentences"]
     return bool(sentences) and "speaker" in sentences[0]


 def test_plan_differences():
     """
     Compare freemium and pro transcriptions of the same audio file.

     Sends one transcription request per plan tier, saves both responses to
     freemium_transcript.json / pro_transcript.json for inspection, and checks
     that only the pro response exposes speaker information.

     Raises:
         AssertionError: if either request does not return HTTP 200, or if the
             speaker fields do not match the expected plan behavior. The
             diagnostic messages are printed before raising so a test runner
             does not report a silent pass.
     """
     # 1. Test the freemium plan (no speaker identification)
     print("Testing Freemium Plan (no speaker identification)...")
     freemium_response = _request_transcript("freemium")

     # 2. Test the pro plan (with speaker identification)
     print("Testing Pro Plan (with speaker identification)...")
     pro_response = _request_transcript("pro")

     # Check if both requests were successful
     if freemium_response.status_code != 200 or pro_response.status_code != 200:
         print(f"Freemium request status: {freemium_response.status_code}")
         print(f"Pro request status: {pro_response.status_code}")
         if freemium_response.status_code != 200:
             print(f"Freemium error: {freemium_response.text}")
         if pro_response.status_code != 200:
             print(f"Pro error: {pro_response.text}")
         raise AssertionError("Transcription request failed for at least one plan tier.")

     freemium_data = freemium_response.json()
     pro_data = pro_response.json()

     # Save the transcripts for inspection
     with open("freemium_transcript.json", "w") as f:
         json.dump(freemium_data, f, indent=4)
     with open("pro_transcript.json", "w") as f:
         json.dump(pro_data, f, indent=4)
     print("Transcripts saved to freemium_transcript.json and pro_transcript.json")

     has_speaker_freemium = _first_sentence_has_speaker(freemium_data)
     has_speaker_pro = _first_sentence_has_speaker(pro_data)
     print(f"Freemium plan has speaker information: {has_speaker_freemium}")
     print(f"Pro plan has speaker information: {has_speaker_pro}")

     # Verify the expected behavior
     if not has_speaker_freemium and has_speaker_pro:
         print("✅ Test PASSED: Freemium plan doesn't show speakers, Pro plan does.")
     else:
         print("❌ Test FAILED: Expected behavior not observed.")
         raise AssertionError(
             "Expected speaker information only in the pro transcript "
             f"(freemium={has_speaker_freemium}, pro={has_speaker_pro})."
         )
 if __name__ == "__main__":
    test_plan_differences()