diff --git a/API_DOCUMENTATION.md b/API_DOCUMENTATION.md index dd8a587..befa3a6 100644 --- a/API_DOCUMENTATION.md +++ b/API_DOCUMENTATION.md @@ -33,11 +33,12 @@ Transcribes audio or video content from a URL. ```json { "media_url": "https://example.com/audio-file.mp3", - "media_type": "audio" // "audio" or "video" + "media_type": "audio", // "audio" or "video" + "plan_tier": "freemium" // "freemium" or "pro" (optional, defaults to "freemium") } ``` -**Response:** +**Pro Plan Response (with speaker identification):** ```json { @@ -45,7 +46,33 @@ Transcribes audio or video content from a URL. "sentences": [ { "sentence": "Hello and welcome to the meeting.", - "speaker": "speaker_0", + "speaker": "speaker_0", // Speaker identification included + "start": 0.0, + "end": 2.5, + "words": [ + { + "word": "Hello", + "start": 0.0, + "end": 0.4 + }, + // Additional words... + ] + }, + // Additional sentences... + ] + } +} +``` + +**Freemium Plan Response (without speaker identification):** + +```json +{ + "transcript": { + "sentences": [ + { + "sentence": "Hello and welcome to the meeting.", + // No "speaker" field in freemium plan "start": 0.0, "end": 2.5, "words": [ @@ -271,14 +298,15 @@ headers = { "Content-Type": "application/json" } -# 1. Transcribe an audio file +# 1. Transcribe an audio file (Pro plan with speaker identification) transcribe_payload = { "media_url": "https://example.com/meeting-recording.mp3", - "media_type": "audio" + "media_type": "audio", + "plan_tier": "pro" # Specify "pro" for speaker identification or "freemium" for no speakers } transcribe_response = requests.post( - f"{base_url}/microdot-ai/transcribe", + f"{base_url}/microdot-ai/transcribe", headers=headers, json=transcribe_payload ) @@ -287,19 +315,19 @@ transcribe_response = requests.post( if transcribe_response.status_code == 200: transcript_data = transcribe_response.json() transcript_json = json.dumps(transcript_data["transcript"]) - + # 2. Generate a summary (Pro plan) summary_payload = { "transcript": transcript_json, "plan_tier": "pro" } - + summary_response = requests.post( - f"{base_url}/microdot-ai/general-summary", + f"{base_url}/microdot-ai/general-summary", headers=headers, json=summary_payload ) - + if summary_response.status_code == 200: summary_data = summary_response.json() print("Summary generated successfully!") @@ -316,7 +344,7 @@ else: const axios = require('axios'); // API endpoint -const baseUrl = 'https://api.microdot.ai'; +const baseUrl = 'http://0.0.0.0:5056'; // Your API key const apiKey = 'your_api_key_here'; @@ -331,34 +359,35 @@ const headers = { const transcribeAudio = async () => { const transcribePayload = { media_url: 'https://example.com/meeting-recording.mp3', - media_type: 'audio' + media_type: 'audio', + plan_tier: 'pro' // Specify 'pro' for speaker identification or 'freemium' for no speakers }; - + try { const transcribeResponse = await axios.post( `${baseUrl}/microdot-ai/transcribe`, transcribePayload, { headers } ); - + const transcriptData = transcribeResponse.data; const transcriptJson = JSON.stringify(transcriptData.transcript); - + // 2. Generate a summary (Pro plan) const summaryPayload = { transcript: transcriptJson, plan_tier: 'pro' }; - + const summaryResponse = await axios.post( `${baseUrl}/microdot-ai/general-summary`, summaryPayload, { headers } ); - + console.log('Summary generated successfully!'); console.log(JSON.stringify(summaryResponse.data.transcript, null, 2)); - + } catch (error) { console.error('Error:', error.response ? error.response.data : error.message); } diff --git a/app.py b/app.py index 174a9ba..d246ffb 100644 --- a/app.py +++ b/app.py @@ -55,6 +55,7 @@ async def get_api_key(api_key_header: str = Security(api_key_header)) -> str: class TranscribeRequest(BaseModel): media_url: Optional[str] = None media_type: Optional[str] # Corrected type hint for media_type + plan_tier: Optional[str] = "freemium" # Default to freemium plan if not specified class ChatResp(BaseModel): # Added BaseModel inheritance error: Optional[str] = None @@ -78,6 +79,16 @@ async def chat_endpoint( api_key: str = Depends(get_api_key) ): try: + # Get the plan tier from the request or default to freemium + plan_tier = request.plan_tier.lower() if request.plan_tier else "freemium" + + # Validate plan tier using our PlanTier enum + valid_tiers = [t.value for t in PlanTier] + if plan_tier not in valid_tiers: + plan_tier = PlanTier.FREEMIUM.value # Default to freemium if invalid tier + + # Check if the plan includes speaker identification + include_speakers = PlanLimits.get_limit(plan_tier, "speaker_identification") # Use the transcribe_media function to transcribe the media if request.media_url: @@ -88,7 +99,7 @@ async def chat_endpoint( # Parse response words = transcription_response["results"]["channels"][0]["alternatives"][0]["words"] - transcript = group_words_into_sentences(words=words) + transcript = group_words_into_sentences(words=words, include_speakers=include_speakers) return TranscriptResponse( transcript=transcript, # Corrected to return the transcript error=None diff --git a/scripts/transcriber.py b/scripts/transcriber.py index 6c6fe4a..2312b46 100644 --- a/scripts/transcriber.py +++ b/scripts/transcriber.py @@ -28,7 +28,7 @@ def extract_audio(url: str, output_template=os.path.join(UPLOAD_FOLDER, "%(title """ Download and extract audio from a video URL using yt-dlp. The file will be saved in the 'upload' folder. - + Returns: str: The absolute path to the downloaded audio file (with a unique id appended). """ @@ -42,51 +42,51 @@ def extract_audio(url: str, output_template=os.path.join(UPLOAD_FOLDER, "%(title }], "quiet": True, } - + with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(url, download=True) # Prepare the filename from the info. # Note: prepare_filename returns the filename *before* postprocessing, # so we change the extension to mp3. original_filepath = os.path.splitext(ydl.prepare_filename(info))[0] + ".mp3" - + # Debug: list files in the upload folder if not os.path.exists(original_filepath): files = os.listdir(UPLOAD_FOLDER) print("Warning: Could not find expected file.") print("Files in upload folder:", files) raise FileNotFoundError(f"Expected audio file not found: {original_filepath}") - + # Get the video's title and sanitize it title = info.get('title', 'audio') safe_title = sanitize_filename(title) - + # Generate a unique identifier unique_id = uuid.uuid4().hex # Unique identifier in hex format - + # Construct the new filename with the unique id appended. new_audio_filename = f"{safe_title}_{unique_id}.mp3" new_audio_filepath = os.path.join(UPLOAD_FOLDER, new_audio_filename) - + # Rename the downloaded file to include the unique ID. os.rename(original_filepath, new_audio_filepath) print(f"Renamed file to: {new_audio_filepath}") - + # Return the absolute path to the renamed audio file. return os.path.abspath(new_audio_filepath) def transcribe_media(file_loc: str, media_type: str = "audio"): """ Transcribe media using Deepgram. - + If media_type is "audio" (remote URL), use Deepgram's URL transcription. If media_type is "video" (remote URL), extract audio locally (in the upload folder), transcribe via file, and then delete the local audio file. - + Args: file_loc (str): URL to the remote audio or video file. media_type (str): "audio" or "video". - + Returns: dict: The transcription response from Deepgram. """ @@ -100,22 +100,22 @@ def transcribe_media(file_loc: str, media_type: str = "audio"): smart_format=True, diarize=True, ) - + if media_type.lower() == "audio": # For remote audio files, use the URL transcription method. response = deepgram.listen.rest.v("1").transcribe_url({"url": file_loc}, options) - + elif media_type.lower() == "video": # For remote video files, first extract the audio locally. local_audio_path = extract_audio(file_loc) print(f"Extracted audio to: {local_audio_path}") - + # Transcribe using the local file method. with open(local_audio_path, "rb") as file: buffer_data = file.read() payload: FileSource = {"buffer": buffer_data} response = deepgram.listen.rest.v("1").transcribe_file(payload, options) - + # Clean up: delete the local audio file. if os.path.exists(local_audio_path): os.remove(local_audio_path) @@ -123,7 +123,7 @@ def transcribe_media(file_loc: str, media_type: str = "audio"): else: raise ValueError("media_type must be either 'audio' or 'video'.") - + return response except Exception as e: @@ -137,54 +137,76 @@ def transcribe_media(file_loc: str, media_type: str = "audio"): -def group_words_into_sentences(words, max_words=15): +def group_words_into_sentences(words, max_words=15, include_speakers=True): + """ + Group words into sentences based on speaker changes. + + Args: + words: List of word objects from the transcription + max_words: Maximum number of words per sentence + include_speakers: Whether to include speaker information in the output + (True for Pro plan, False for Freemium plan) + + Returns: + A dictionary containing the sentences + """ sentences = [] current_sentence = [] current_speaker = None start_time = None - + for i, word_info in enumerate(words): word = word_info["punctuated_word"] - speaker = word_info["speaker"] + speaker = word_info["speaker"] if include_speakers else "speaker_0" # Use a default speaker if not including speakers start = word_info["start"] end = word_info["end"] - + # If speaker changes or sentence reaches max length, start a new sentence if speaker != current_speaker: if current_sentence: - sentences.append({ + sentence_obj = { "sentence": " ".join([w["word"] for w in current_sentence]), - "speaker": current_speaker, "start": start_time, "end": words[i-1]["end"], "words": current_sentence - }) + } + + # Only include speaker information if include_speakers is True + if include_speakers: + sentence_obj["speaker"] = current_speaker + + sentences.append(sentence_obj) current_sentence = [] current_speaker = speaker start_time = start - + # Append word with metadata inside the current sentence current_sentence.append({"word": word, "start": start, "end": end}) # Append the last sentence if any words remain if current_sentence: - sentences.append({ + sentence_obj = { "sentence": " ".join([w["word"] for w in current_sentence]), - "speaker": current_speaker, "start": start_time, "end": words[-1]["end"], "words": current_sentence - }) - + } + + # Only include speaker information if include_speakers is True + if include_speakers: + sentence_obj["speaker"] = current_speaker + + sentences.append(sentence_obj) + return {"sentences": sentences} if __name__ == "__main__": - + audio_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/019933724441Business%20English%20Conversation%20Lesson%2045_%20Meeting%20a%20New%20Colleague.mp3" video_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/038426704141Business%20English%20Conversation%20Lesson%2045_%20%20Meeting%20a%20New%20Colleague.mp4" # Folder for file uploads/downloads - + response = transcribe_media(video_url, media_type="video") print(response) \ No newline at end of file diff --git a/src/models.py b/src/models.py index fe530cd..01852ed 100644 --- a/src/models.py +++ b/src/models.py @@ -15,13 +15,15 @@ class PlanLimits: "transcription_minutes": 200, "summary_type": "basic", "transcript_history_days": 7, - "integrations": ["google_meet", "zoom"] + "integrations": ["google_meet", "zoom"], + "speaker_identification": False }, PlanTier.PRO: { "transcription_minutes": 600, "summary_type": "advanced", "transcript_history_days": 30, - "integrations": ["google_meet", "zoom", "slack", "notion", "asana", "microsoft_teams"] + "integrations": ["google_meet", "zoom", "slack", "notion", "asana", "microsoft_teams"], + "speaker_identification": True } } diff --git a/test1.py b/test1.py index 00f9270..0bd4159 100644 --- a/test1.py +++ b/test1.py @@ -19,33 +19,76 @@ headers = { # Audio URL from your notebook audio_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/019933724441Business%20English%20Conversation%20Lesson%2045_%20Meeting%20a%20New%20Colleague.mp3" -# 1. First, transcribe the audio -transcribe_payload = { +# 1. First, transcribe the audio with the Pro plan (with speaker identification) +pro_transcribe_payload = { "media_url": audio_url, - "media_type": "audio" + "media_type": "audio", + "plan_tier": "pro" # Specify the pro plan to include speaker identification } -transcribe_response = requests.post( - f"{base_url}/microdot-ai/transcribe", +pro_transcribe_response = requests.post( + f"{base_url}/microdot-ai/transcribe", headers=headers, - json=transcribe_payload + json=pro_transcribe_payload ) -# Check if transcription was successful -if transcribe_response.status_code == 200: - transcript_data = transcribe_response.json() - print("Transcription successful!") - - # Save the transcript for later use - transcript_json = json.dumps(transcript_data["transcript"], indent=4) - # Save the transcript to a file - with open("transcript.json", "w") as f: - f.write(transcript_json) - print("Transcript saved to transcript.json") +# Check if Pro plan transcription was successful +if pro_transcribe_response.status_code == 200: + pro_transcript_data = pro_transcribe_response.json() + print("Pro plan transcription successful!") + # Save the Pro plan transcript for later use + pro_transcript_json = json.dumps(pro_transcript_data["transcript"], indent=4) + # Save the Pro plan transcript to a file + with open("pro_transcript.json", "w") as f: + f.write(pro_transcript_json) + print("Pro plan transcript saved to pro_transcript.json") + + # Check if the Pro plan transcript has speaker information + has_speaker_pro = "speaker" in pro_transcript_data["transcript"]["sentences"][0] if pro_transcript_data["transcript"]["sentences"] else False + print(f"Pro plan has speaker information: {has_speaker_pro}") else: - print(f"Transcription failed with status code: {transcribe_response.status_code}") - print(transcribe_response.text) + print(f"Pro plan transcription failed with status code: {pro_transcribe_response.status_code}") + print(pro_transcribe_response.text) + +# 1b. Now transcribe with the Free plan (without speaker identification) +free_transcribe_payload = { + "media_url": audio_url, + "media_type": "audio", + "plan_tier": "freemium" # Specify the freemium plan to exclude speaker identification +} + +free_transcribe_response = requests.post( + f"{base_url}/microdot-ai/transcribe", + headers=headers, + json=free_transcribe_payload +) + +# Check if Free plan transcription was successful +if free_transcribe_response.status_code == 200: + free_transcript_data = free_transcribe_response.json() + print("Free plan transcription successful!") + + # Save the Free plan transcript for later use + free_transcript_json = json.dumps(free_transcript_data["transcript"], indent=4) + # Save the Free plan transcript to a file + with open("free_transcript.json", "w") as f: + f.write(free_transcript_json) + print("Free plan transcript saved to free_transcript.json") + + # Check if the Free plan transcript has speaker information + has_speaker_free = "speaker" in free_transcript_data["transcript"]["sentences"][0] if free_transcript_data["transcript"]["sentences"] else False + print(f"Free plan has speaker information: {has_speaker_free}") + + # Use the Pro plan transcript for the summary tests + transcript_json = pro_transcript_json +else: + print(f"Free plan transcription failed with status code: {free_transcribe_response.status_code}") + print(free_transcribe_response.text) + + # If Free plan fails but Pro plan succeeded, use Pro plan transcript for summary tests + if pro_transcribe_response.status_code == 200: + transcript_json = pro_transcript_json @@ -56,7 +99,7 @@ basic_summary_payload = { } basic_summary_response = requests.post( - f"{base_url}/microdot-ai/general-summary", + f"{base_url}/microdot-ai/general-summary", headers=headers, json=basic_summary_payload ) @@ -83,7 +126,7 @@ advanced_summary_payload = { } advanced_summary_response = requests.post( - f"{base_url}/microdot-ai/general-summary", + f"{base_url}/microdot-ai/general-summary", headers=headers, json=advanced_summary_payload ) @@ -93,7 +136,7 @@ if advanced_summary_response.status_code == 200: advanced_summary_data = advanced_summary_response.json() print("\n--- Advanced (Pro) Summary ---") advanced_summary_json = json.dumps(advanced_summary_data, indent=2) - + with open("advanced_summary.json", "w") as f: f.write(advanced_summary_json) print("Advanced summary saved to advanced_summary.json") diff --git a/test_plans.py b/test_plans.py new file mode 100644 index 0000000..de182fa --- /dev/null +++ b/test_plans.py @@ -0,0 +1,90 @@ +import os +import requests +import json +from dotenv import load_dotenv +load_dotenv() + +# API endpoint +base_url = "http://localhost:5056" + +# Your API key +api_key = os.getenv("API_KEY_ACCESS") + +# Headers +headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" +} + +# Audio URL for testing +audio_url = "https://s3.us-east-2.amazonaws.com/com.mkdlabs.images/baas/jordan/019933724441Business%20English%20Conversation%20Lesson%2045_%20Meeting%20a%20New%20Colleague.mp3" + +# Test function to compare freemium and pro plan transcriptions +def test_plan_differences(): + # 1. Test the freemium plan (no speaker identification) + freemium_payload = { + "media_url": audio_url, + "media_type": "audio", + "plan_tier": "freemium" + } + + print("Testing Freemium Plan (no speaker identification)...") + freemium_response = requests.post( + f"{base_url}/microdot-ai/transcribe", + headers=headers, + json=freemium_payload + ) + + # 2. Test the pro plan (with speaker identification) + pro_payload = { + "media_url": audio_url, + "media_type": "audio", + "plan_tier": "pro" + } + + print("Testing Pro Plan (with speaker identification)...") + pro_response = requests.post( + f"{base_url}/microdot-ai/transcribe", + headers=headers, + json=pro_payload + ) + + # Check if both requests were successful + if freemium_response.status_code == 200 and pro_response.status_code == 200: + freemium_data = freemium_response.json() + pro_data = pro_response.json() + + # Save the transcripts for inspection + with open("freemium_transcript.json", "w") as f: + f.write(json.dumps(freemium_data, indent=4)) + + with open("pro_transcript.json", "w") as f: + f.write(json.dumps(pro_data, indent=4)) + + print("Transcripts saved to freemium_transcript.json and pro_transcript.json") + + # Check if the freemium plan has speaker information + has_speaker_freemium = "speaker" in freemium_data["transcript"]["sentences"][0] if freemium_data["transcript"]["sentences"] else False + + # Check if the pro plan has speaker information + has_speaker_pro = "speaker" in pro_data["transcript"]["sentences"][0] if pro_data["transcript"]["sentences"] else False + + print(f"Freemium plan has speaker information: {has_speaker_freemium}") + print(f"Pro plan has speaker information: {has_speaker_pro}") + + # Verify the expected behavior + if not has_speaker_freemium and has_speaker_pro: + print("✅ Test PASSED: Freemium plan doesn't show speakers, Pro plan does.") + else: + print("❌ Test FAILED: Expected behavior not observed.") + + else: + print(f"Freemium request status: {freemium_response.status_code}") + print(f"Pro request status: {pro_response.status_code}") + if freemium_response.status_code != 200: + print(f"Freemium error: {freemium_response.text}") + if pro_response.status_code != 200: + print(f"Pro error: {pro_response.text}") + +if __name__ == "__main__": + test_plan_differences()