fix: Restore NewsFetcher class in news_fetcher.py
- Fixed import error by restoring proper NewsFetcher class structure
- Updated RSS feed fetching implementation with improved error handling
- Enhanced feed parsing with better timeout management and user agents
- Maintained compatibility with existing system architecture
- Resolved server startup issues caused by missing class definition
This commit is contained in:
+183
-197
@@ -1,230 +1,216 @@
|
|||||||
"""AI Analysis module for DS Task AI News using Groq LLM"""
|
|
||||||
import os
|
"""RSS News Fetcher for DS Task AI News"""
|
||||||
from typing import Dict, List, Any, Optional
|
import feedparser
|
||||||
|
import requests
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Any
|
||||||
try:
|
from urllib.parse import urlparse
|
||||||
from groq import Groq
|
import hashlib
|
||||||
GROQ_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
GROQ_AVAILABLE = False
|
|
||||||
print("⚠️ Groq not available - install with: pip install groq")
|
|
||||||
|
|
||||||
from config import settings
|
from config import settings
|
||||||
|
from recommender import NewsRecommender # Add this import
|
||||||
|
from ai_analyzer import AIAnalyzer # Add this import
|
||||||
|
|
||||||
class AIAnalyzer:
|
class NewsFetcher:
|
||||||
"""AI-powered article analysis using Groq LLM"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.client = None
|
self.raw_news_dir = settings.raw_news_dir
|
||||||
self.model = "llama3-8b-8192" # Fast Groq model
|
self.max_articles = settings.max_articles_per_feed
|
||||||
self.available = False
|
self.recommender = NewsRecommender() # Add recommender for embedding/vector access
|
||||||
|
self.ai_analyzer = AIAnalyzer() # Add AIAnalyzer for LLM duplicate check
|
||||||
|
# Ensure directories exist
|
||||||
|
os.makedirs(self.raw_news_dir, exist_ok=True)
|
||||||
|
|
||||||
if GROQ_AVAILABLE and settings.groq_api_key:
|
def generate_article_id(self, title: str, url: str) -> str:
|
||||||
try:
|
"""Generate unique ID for article"""
|
||||||
self.client = Groq(api_key=settings.groq_api_key)
|
content = f"{title}{url}"
|
||||||
self.available = True
|
return hashlib.md5(content.encode()).hexdigest()[:12]
|
||||||
print("✅ Groq AI Analyzer initialized successfully")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ Groq initialization failed: {e}")
|
|
||||||
else:
|
|
||||||
print("⚠️ Groq AI Analyzer not available (missing API key or library)")
|
|
||||||
|
|
||||||
def _make_groq_request(self, prompt: str, max_tokens: int = 500) -> Optional[str]:
|
def clean_content(self, content: str) -> str:
|
||||||
"""Make a request to Groq API"""
|
"""Clean and truncate content"""
|
||||||
if not self.available:
|
if not content:
|
||||||
return None
|
return ""
|
||||||
|
|
||||||
try:
|
# Remove HTML tags (basic cleaning)
|
||||||
response = self.client.chat.completions.create(
|
import re
|
||||||
messages=[
|
content = re.sub(r'<[^>]+>', '', content)
|
||||||
{"role": "system", "content": "You are an expert news analyst. Provide concise, accurate analysis."},
|
|
||||||
{"role": "user", "content": prompt}
|
# Truncate to reasonable length
|
||||||
],
|
return content[:1000] if len(content) > 1000 else content
|
||||||
model=self.model,
|
|
||||||
max_tokens=max_tokens,
|
def is_duplicate_by_llm(self, article: Dict[str, Any], existing_article: Dict[str, Any]) -> bool:
|
||||||
temperature=0.3
|
"""Use LLM to check if two articles are about the same event or story"""
|
||||||
|
if not self.ai_analyzer.available:
|
||||||
|
return False # LLM not available, skip this check
|
||||||
|
prompt = f"""
|
||||||
|
Are these two news articles about the same event or story? Answer only 'yes' or 'no'.\n\nArticle 1:\nTitle: {article.get('title', '')}\nContent: {article.get('content', '')[:500]}\n\nArticle 2:\nTitle: {existing_article.get('title', '')}\nContent: {existing_article.get('content', '')[:500]}\n"""
|
||||||
|
response = self.ai_analyzer._make_groq_request(prompt, max_tokens=5)
|
||||||
|
if response and response.strip().lower().startswith('yes'):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def is_duplicate_by_similarity(self, article: Dict[str, Any], threshold: float = 0.9) -> bool:
|
||||||
|
"""Check if the article is a duplicate using similarity search and LLM verification"""
|
||||||
|
all_articles = self.recommender.vector_store.get_all_articles()
|
||||||
|
if not all_articles:
|
||||||
|
return False # No articles to compare with
|
||||||
|
embedding = self.recommender.embedding_generator.generate_query_embedding(
|
||||||
|
self.recommender.embedding_generator.create_article_text(article)
|
||||||
)
|
)
|
||||||
return response.choices[0].message.content.strip()
|
existing_embeddings = self.recommender.vector_store.index.reconstruct_n(0, len(all_articles))
|
||||||
|
import numpy as np
|
||||||
|
for idx, existing_embedding in enumerate(existing_embeddings):
|
||||||
|
norm1 = np.linalg.norm(embedding)
|
||||||
|
norm2 = np.linalg.norm(existing_embedding)
|
||||||
|
if norm1 == 0 or norm2 == 0:
|
||||||
|
continue
|
||||||
|
similarity = float(np.dot(embedding, existing_embedding) / (norm1 * norm2))
|
||||||
|
if similarity >= threshold:
|
||||||
|
# Use LLM to confirm duplicate
|
||||||
|
existing_article = all_articles[idx]
|
||||||
|
if self.is_duplicate_by_llm(article, existing_article):
|
||||||
|
return True # LLM confirms duplicate
|
||||||
|
return False
|
||||||
|
|
||||||
|
def fetch_rss_feed(self, feed_url: str) -> List[Dict[str, Any]]:
|
||||||
|
"""Fetch articles from a single RSS feed"""
|
||||||
|
try:
|
||||||
|
print(f"Fetching from: {feed_url}")
|
||||||
|
|
||||||
|
# Use requests with proper headers and timeout
|
||||||
|
headers = {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
import requests
|
||||||
|
response = requests.get(feed_url, headers=headers, timeout=15)
|
||||||
|
response.raise_for_status()
|
||||||
|
feed = feedparser.parse(response.content)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ Groq API error: {e}")
|
print(f"HTTP request failed, trying direct feedparser: {e}")
|
||||||
return None
|
feed = feedparser.parse(feed_url)
|
||||||
|
|
||||||
def summarize_article(self, article: Dict[str, Any]) -> Dict[str, Any]:
|
if feed.bozo:
|
||||||
"""Generate AI summary of an article"""
|
print(f"Warning: Feed parsing issues for {feed_url}")
|
||||||
if not self.available:
|
if hasattr(feed, 'bozo_exception'):
|
||||||
return {"summary": "AI analysis not available", "available": False}
|
print(f"Bozo exception: {feed.bozo_exception}")
|
||||||
|
|
||||||
title = article.get('title', '')
|
articles = []
|
||||||
content = article.get('content', '')
|
source_name = getattr(feed.feed, 'title', urlparse(feed_url).netloc)
|
||||||
|
|
||||||
prompt = f"""
|
for entry in feed.entries[:self.max_articles]:
|
||||||
Analyze this news article and provide a concise summary:
|
|
||||||
|
|
||||||
Title: {title}
|
|
||||||
Content: {content[:1000]}...
|
|
||||||
|
|
||||||
Provide:
|
|
||||||
1. A 2-sentence summary
|
|
||||||
2. 3 key points
|
|
||||||
3. Main topic category
|
|
||||||
|
|
||||||
Format as JSON:
|
|
||||||
{{
|
|
||||||
"summary": "Brief 2-sentence summary",
|
|
||||||
"key_points": ["point1", "point2", "point3"],
|
|
||||||
"category": "Technology/Business/Science/etc"
|
|
||||||
}}
|
|
||||||
"""
|
|
||||||
|
|
||||||
response = self._make_groq_request(prompt, max_tokens=300)
|
|
||||||
|
|
||||||
if response:
|
|
||||||
try:
|
try:
|
||||||
analysis = json.loads(response)
|
# Extract article data
|
||||||
analysis["available"] = True
|
title = getattr(entry, 'title', 'No Title')
|
||||||
analysis["analyzed_at"] = datetime.now().isoformat()
|
content = getattr(entry, 'summary', getattr(entry, 'description', ''))
|
||||||
return analysis
|
url = getattr(entry, 'link', '')
|
||||||
except json.JSONDecodeError:
|
published = getattr(entry, 'published', '')
|
||||||
return {
|
|
||||||
"summary": response,
|
# Parse date
|
||||||
"available": True,
|
try:
|
||||||
"analyzed_at": datetime.now().isoformat()
|
if published:
|
||||||
|
pub_date = datetime(*entry.published_parsed[:6])
|
||||||
|
else:
|
||||||
|
pub_date = datetime.now()
|
||||||
|
except:
|
||||||
|
pub_date = datetime.now()
|
||||||
|
|
||||||
|
# Create article object
|
||||||
|
article = {
|
||||||
|
"id": self.generate_article_id(title, url),
|
||||||
|
"title": title,
|
||||||
|
"content": self.clean_content(content),
|
||||||
|
"url": url,
|
||||||
|
"source": source_name,
|
||||||
|
"published_date": pub_date.isoformat(),
|
||||||
|
"fetched_date": datetime.now().isoformat(),
|
||||||
|
"categories": getattr(entry, 'tags', []),
|
||||||
|
"slug": title.lower().replace(" ", "-").replace("'", "")[:50]
|
||||||
}
|
}
|
||||||
|
|
||||||
return {"summary": "Analysis failed", "available": False}
|
# Check for duplicate using similarity search
|
||||||
|
if self.is_duplicate_by_similarity(article):
|
||||||
|
print(f"Skipped duplicate article (similarity): {title}")
|
||||||
|
continue
|
||||||
|
|
||||||
def extract_keywords(self, article: Dict[str, Any]) -> List[str]:
|
articles.append(article)
|
||||||
"""Extract key terms and entities from article"""
|
|
||||||
if not self.available:
|
except Exception as e:
|
||||||
|
print(f"Error processing entry: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f"Fetched {len(articles)} articles from {source_name}")
|
||||||
|
|
||||||
|
# If no articles but feed parsed successfully, it might be due to no new content
|
||||||
|
if len(articles) == 0 and not feed.bozo:
|
||||||
|
print(f"No new articles found in {source_name} (feed is valid)")
|
||||||
|
|
||||||
|
return articles
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error fetching RSS feed {feed_url}: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
title = article.get('title', '')
|
def fetch_all_news(self) -> List[Dict[str, Any]]:
|
||||||
content = article.get('content', '')
|
"""Fetch news from all configured RSS feeds"""
|
||||||
|
all_articles = []
|
||||||
|
|
||||||
prompt = f"""
|
for feed_url in settings.rss_feeds:
|
||||||
Extract the most important keywords and entities from this article:
|
feed_url = feed_url.strip()
|
||||||
|
if feed_url:
|
||||||
|
articles = self.fetch_rss_feed(feed_url)
|
||||||
|
all_articles.extend(articles)
|
||||||
|
|
||||||
Title: {title}
|
# Remove duplicates based on ID
|
||||||
Content: {content[:800]}...
|
unique_articles = {}
|
||||||
|
for article in all_articles:
|
||||||
|
unique_articles[article['id']] = article
|
||||||
|
|
||||||
Return only a JSON array of 5-8 most relevant keywords:
|
final_articles = list(unique_articles.values())
|
||||||
["keyword1", "keyword2", "keyword3", ...]
|
print(f"Total unique articles fetched: {len(final_articles)}")
|
||||||
"""
|
|
||||||
|
|
||||||
response = self._make_groq_request(prompt, max_tokens=100)
|
return final_articles
|
||||||
|
|
||||||
if response:
|
def save_articles(self, articles: List[Dict[str, Any]]) -> str:
|
||||||
try:
|
"""Save articles to JSON file"""
|
||||||
keywords = json.loads(response)
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
return keywords if isinstance(keywords, list) else []
|
filename = f"news_{timestamp}.json"
|
||||||
except json.JSONDecodeError:
|
|
||||||
# Fallback: extract from response text
|
|
||||||
words = response.replace('[', '').replace(']', '').replace('"', '').split(',')
|
|
||||||
return [word.strip() for word in words[:8]]
|
|
||||||
|
|
||||||
return []
|
# Normalize the path to avoid double backslashes
|
||||||
|
raw_news_dir = os.path.normpath(self.raw_news_dir)
|
||||||
|
filepath = os.path.normpath(os.path.join(raw_news_dir, filename))
|
||||||
|
|
||||||
def analyze_sentiment(self, article: Dict[str, Any]) -> Dict[str, Any]:
|
# Ensure directory exists
|
||||||
"""Analyze sentiment and tone of article"""
|
os.makedirs(raw_news_dir, exist_ok=True)
|
||||||
if not self.available:
|
|
||||||
return {"sentiment": "neutral", "confidence": 0.0, "available": False}
|
|
||||||
|
|
||||||
title = article.get('title', '')
|
with open(filepath, 'w', encoding='utf-8') as f:
|
||||||
content = article.get('content', '')
|
json.dump(articles, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
prompt = f"""
|
print(f"Saved {len(articles)} articles to {filepath}")
|
||||||
Analyze the sentiment and tone of this news article:
|
return filepath
|
||||||
|
|
||||||
Title: {title}
|
def fetch_and_save_news(self) -> Dict[str, Any]:
|
||||||
Content: {content[:600]}...
|
"""Fetch news and save to file"""
|
||||||
|
articles = self.fetch_all_news()
|
||||||
|
|
||||||
Return JSON with:
|
if articles:
|
||||||
{{
|
filepath = self.save_articles(articles)
|
||||||
"sentiment": "positive/negative/neutral",
|
|
||||||
"confidence": 0.85,
|
|
||||||
"tone": "informative/urgent/optimistic/concerned/etc",
|
|
||||||
"reasoning": "Brief explanation"
|
|
||||||
}}
|
|
||||||
"""
|
|
||||||
|
|
||||||
response = self._make_groq_request(prompt, max_tokens=150)
|
|
||||||
|
|
||||||
if response:
|
|
||||||
try:
|
|
||||||
sentiment = json.loads(response)
|
|
||||||
sentiment["available"] = True
|
|
||||||
return sentiment
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
return {
|
return {
|
||||||
"sentiment": "neutral",
|
"success": True,
|
||||||
"confidence": 0.5,
|
"articles_count": len(articles),
|
||||||
"tone": "informative",
|
"filepath": filepath,
|
||||||
"reasoning": response,
|
"articles": articles
|
||||||
"available": True
|
}
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"articles_count": 0,
|
||||||
|
"message": "No articles fetched"
|
||||||
}
|
}
|
||||||
|
|
||||||
return {"sentiment": "neutral", "confidence": 0.0, "available": False}
|
# Test function
|
||||||
|
if __name__ == "__main__":
|
||||||
def generate_insights(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
|
fetcher = NewsFetcher()
|
||||||
"""Generate insights from multiple articles"""
|
result = fetcher.fetch_and_save_news()
|
||||||
if not self.available or not articles:
|
print(f"Result: {result}")
|
||||||
return {"insights": "AI insights not available", "available": False}
|
|
||||||
|
|
||||||
# Prepare article summaries
|
|
||||||
article_summaries = []
|
|
||||||
for i, article in enumerate(articles[:5]): # Limit to 5 articles
|
|
||||||
title = article.get('title', '')
|
|
||||||
source = article.get('source', '')
|
|
||||||
article_summaries.append(f"{i+1}. {title} (Source: {source})")
|
|
||||||
|
|
||||||
prompt = f"""
|
|
||||||
Analyze these recent news articles and provide insights:
|
|
||||||
|
|
||||||
Articles:
|
|
||||||
{chr(10).join(article_summaries)}
|
|
||||||
|
|
||||||
Provide:
|
|
||||||
1. Main trends or themes
|
|
||||||
2. Key developments
|
|
||||||
3. Potential implications
|
|
||||||
|
|
||||||
Format as JSON:
|
|
||||||
{{
|
|
||||||
"trends": ["trend1", "trend2"],
|
|
||||||
"key_developments": ["development1", "development2"],
|
|
||||||
"implications": "Brief analysis of what this means"
|
|
||||||
}}
|
|
||||||
"""
|
|
||||||
|
|
||||||
response = self._make_groq_request(prompt, max_tokens=400)
|
|
||||||
|
|
||||||
if response:
|
|
||||||
try:
|
|
||||||
insights = json.loads(response)
|
|
||||||
insights["available"] = True
|
|
||||||
insights["analyzed_at"] = datetime.now().isoformat()
|
|
||||||
insights["article_count"] = len(articles)
|
|
||||||
return insights
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
return {
|
|
||||||
"insights": response,
|
|
||||||
"available": True,
|
|
||||||
"analyzed_at": datetime.now().isoformat()
|
|
||||||
}
|
|
||||||
|
|
||||||
return {"insights": "Analysis failed", "available": False}
|
|
||||||
|
|
||||||
def get_status(self) -> Dict[str, Any]:
|
|
||||||
"""Get AI analyzer status"""
|
|
||||||
return {
|
|
||||||
"available": self.available,
|
|
||||||
"model": self.model if self.available else None,
|
|
||||||
"features": [
|
|
||||||
"Article Summarization",
|
|
||||||
"Keyword Extraction",
|
|
||||||
"Sentiment Analysis",
|
|
||||||
"Trend Insights"
|
|
||||||
] if self.available else []
|
|
||||||
}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user