75a0a3fde7
- Added `get_latest_email_date()` function in `database.py` to retrieve the most recent email date for a given account and folder. - Enhanced `fetch_folder_emails()` in `zoho_client.py` to intelligently determine the start date for fetching emails based on the latest email date in the database. - Introduced `analyze_and_update_threads_async()` for asynchronous analysis of email threads, allowing concurrent processing. - Created a synchronous wrapper `analyze_and_update_threads()` for easier integration. - Updated `fetch_emails()` to support database session and account email parameters. - Added comprehensive documentation in `AI_ANALYSIS_GUIDE.md` detailing the new AI analysis functionality. - Implemented tests for the new features, including `test_fetch_with_db.py`, `test_ai_analysis.py`, and `test_single_analysis.py`. - Added error handling and logging improvements throughout the codebase.
169 lines
6.1 KiB
Python
169 lines
6.1 KiB
Python
import json
|
|
import os
|
|
import time
|
|
from typing import Dict, List
|
|
|
|
from groq import Groq
|
|
|
|
# Rate limiting state shared by every call to _rate_limit() in this module.
# Timestamp (time.time() seconds) stored by _rate_limit() after each check;
# the initial 0 means no API call has been recorded yet.
_last_api_call = 0
_min_interval = 1.0  # Minimum seconds that must separate two API calls
|
|
|
|
|
|
def _rate_limit():
    """Block until at least ``_min_interval`` seconds separate Groq API calls.

    Reads the module-level ``_last_api_call`` timestamp, sleeps for the
    remainder of the interval when the previous call was too recent, and then
    stores the current time back into ``_last_api_call``.

    Timestamps come from the wall clock (``time.time()``). If that clock is
    set backwards between two calls the elapsed time becomes negative, and an
    uncapped wait would last as long as the clock jump. The wait is therefore
    capped at one full interval.
    """
    global _last_api_call

    elapsed = time.time() - _last_api_call
    if elapsed < _min_interval:
        # Remaining time in the interval, but never more than one interval
        # (and never negative, which time.sleep() would reject).
        wait = min(_min_interval - elapsed, max(_min_interval, 0.0))
        time.sleep(wait)

    _last_api_call = time.time()
|
|
|
|
|
|
def _format_messages_for_context(messages: List[dict]) -> str:
|
|
lines = []
|
|
for m in messages:
|
|
direction = "IN" if m.get("is_incoming", True) else "OUT"
|
|
date = m.get("date_sent")
|
|
subj = m.get("subject") or ""
|
|
from_email = m.get("from_email") or ""
|
|
to_email = m.get("to_email") or ""
|
|
body = (m.get("body") or "").strip()
|
|
if len(body) > 1000:
|
|
body = body[:1000] + "..."
|
|
lines.append(
|
|
f"[{date}] [{direction}] {from_email} -> {to_email}\nSubject: {subj}\n{body}"
|
|
)
|
|
return "\n\n---\n\n".join(lines)
|
|
|
|
|
|
def _heuristic_analyze(messages: List[dict]) -> Dict:
|
|
# Simple fallback if Groq isn't available
|
|
body_concat = "\n\n".join([(m.get("body") or "") for m in messages[-4:]])
|
|
question_like = "?" in body_concat or any(
|
|
kw in body_concat.lower()
|
|
for kw in ["could you", "can you", "please", "let me know", "need", "request"]
|
|
)
|
|
last_subj = (messages[-1].get("subject") or "") if messages else ""
|
|
return {
|
|
"actionable": bool(question_like),
|
|
"summary": (body_concat[:350] + "...")
|
|
if len(body_concat) > 350
|
|
else body_concat,
|
|
"subject": last_subj,
|
|
"confidence": 0.35,
|
|
"model": "heuristic",
|
|
}
|
|
|
|
|
|
def analyze_thread(
    thread_subject: str, messages: List[dict], max_messages: int = 4
) -> Dict:
    """Analyze an email thread with the Groq LLM, falling back to a heuristic.

    Args:
        thread_subject: Subject of the thread; ``None``/empty is sent as "".
        messages: Message dicts, oldest to newest, in the form accepted by
            ``_format_messages_for_context``.
        max_messages: Number of most recent messages to consider. A falsy
            value (0/None) means all messages.

    Returns:
        Dict with keys:
        - actionable: bool
        - summary: str
        - subject: str
        - confidence: float (0..1) -- passed through from the model as-is
        - model: str -- the model the request was sent to, or "heuristic"

    The heuristic result from ``_heuristic_analyze`` is returned instead when
    GROQ_API_KEY is missing, there are no messages, the prompt is too long,
    or anything in the API call or response parsing fails.
    """
    msgs = messages[-max_messages:] if max_messages else messages

    api_key = os.getenv("GROQ_API_KEY")
    # No credentials or nothing to analyze: skip client setup entirely.
    if not api_key or not msgs:
        return _heuristic_analyze(msgs)

    # Resolve the model once so the request and the reported "model" agree.
    model_name = os.getenv("GROQ_MODEL", "llama3-8b-8192")

    system_prompt = (
        "You are a helpful assistant that triages email threads and writes concise summaries. "
        "Decide if the thread requires a reply from our side now, based on the last few messages. "
        "Ignore newsletters/automations (e.g., from no-reply), and focus on whether there's a clear question or request. "
        "Return a strict JSON object with keys: actionable (true/false), summary (<= 80 words), confidence (0..1)."
    )

    try:
        # Check message content length to avoid oversized requests
        formatted_context = _format_messages_for_context(msgs)
        if len(formatted_context) > 10000:  # Limit context size
            original_length = len(formatted_context)
            # Keep only the last 2 messages when the context is too long
            formatted_context = _format_messages_for_context(msgs[-2:])
            print(
                f"Warning: Truncated message context due to length ({original_length} chars)"
            )

        user_prompt = (
            f"Thread subject: {thread_subject or ''}\n\n"
            "Recent messages (oldest to newest):\n\n"
            f"{formatted_context}\n\n"
            "Respond with only JSON, no extra commentary."
        )

        # Validate prompt length
        total_prompt_length = len(system_prompt) + len(user_prompt)
        if total_prompt_length > 15000:  # Still too long after truncation
            print(
                f"Warning: Prompt too long ({total_prompt_length} chars), falling back to heuristic"
            )
            return _heuristic_analyze(msgs)

        # Built inside the try so a client construction failure also falls
        # back to the heuristic instead of propagating to the caller.
        client = Groq(api_key=api_key)

        # Apply rate limiting before API call
        _rate_limit()

        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
            max_tokens=300,
        )
        # An absent content value is treated as an empty (unparseable) reply.
        content = (completion.choices[0].message.content or "").strip()
        data = _parse_model_json(content)

        # Basic validation
        if not isinstance(data.get("actionable"), bool) or not isinstance(
            data.get("summary"), str
        ):
            raise ValueError("Invalid schema from model")

        data.setdefault("subject", thread_subject or "")
        data.setdefault("model", model_name)
        return data
    except json.JSONDecodeError as e:
        print(f"JSON decode error from Groq API: {e}")
        return _heuristic_analyze(msgs)
    except Exception as e:
        # Deliberate catch-all: analysis is best-effort, so any failure is
        # logged and replaced by the heuristic result.
        error_msg = str(e)
        print(f"Groq API error: {error_msg}")

        # Check for specific error types
        if "400" in error_msg or "Bad Request" in error_msg:
            print("400 Bad Request - likely prompt too long or invalid format")
        elif "429" in error_msg or "rate limit" in error_msg.lower():
            print("Rate limit exceeded - consider reducing concurrent requests")
        elif "401" in error_msg or "unauthorized" in error_msg.lower():
            print("Unauthorized - check GROQ_API_KEY")
        elif "503" in error_msg or "service unavailable" in error_msg.lower():
            print("Service unavailable - Groq API may be down")

        # Fallback to heuristic
        return _heuristic_analyze(msgs)


def _parse_model_json(content: str) -> Dict:
    """Decode the model reply into a dict, tolerating text around the JSON.

    Tries the whole reply first. If that fails, retries on the span from the
    first ``{`` to the last ``}``, which covers replies wrapped in a code
    fence or prefixed with commentary.

    Raises:
        json.JSONDecodeError: if no decodable JSON is found.
        ValueError: if the decoded value is not a JSON object.
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(content[start : end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("Invalid schema from model")
    return parsed
|