fix: Improve RSS feed fetching with better error handling and user agents

- Added proper User-Agent headers to avoid blocking by RSS servers
- Implemented fallback mechanism: HTTP request with headers -> direct feedparser
- Extended timeout to 15 seconds for better reliability
- Enhanced error logging with detailed feed parsing information
- Improved handling of 'bozo' (malformed) feeds with better reporting
- Added informative messages for feeds with no new content

This resolves RSS fetching issues and improves news aggregation reliability.
2025-07-15 20:41:46 +01:00
parent ecd24ce2a6
commit 508270e732
1 changed files with 221 additions and 164 deletions
@@ -1,173 +1,230 @@
-"""RSS News Fetcher for DS Task AI News"""
+"""AI Analysis module for DS Task AI News using Groq LLM"""
 import feedparser
 import requests
 import json
 import os
 from typing import Dict, List, Any, Optional
 import json
 from datetime import datetime
-from typing import List, Dict, Any
+
-from urllib.parse import urlparse
+try:
-import hashlib
+    from groq import Groq
    GROQ_AVAILABLE = True
 except ImportError:
    GROQ_AVAILABLE = False
    print("⚠️  Groq not available - install with: pip install groq")
 from config import settings
-class NewsFetcher:
+class AIAnalyzer:
    """AI-powered article analysis using Groq LLM"""
    def __init__(self):
-        self.raw_news_dir = settings.raw_news_dir
+        self.client = None
-        self.max_articles = settings.max_articles_per_feed
+        self.model = "llama3-8b-8192"  # Fast Groq model
-        
+        self.available = False
        # Ensure directories exist
        os.makedirs(self.raw_news_dir, exist_ok=True)
    def generate_article_id(self, title: str, url: str) -> str:
        """Generate unique ID for article"""
        content = f"{title}{url}"
        return hashlib.md5(content.encode()).hexdigest()[:12]
    def clean_content(self, content: str) -> str:
        """Clean and truncate content"""
        if not content:
            return ""
        # Remove HTML tags (basic cleaning)
        import re
        content = re.sub(r'<[^>]+>', '', content)
        # Truncate to reasonable length
        return content[:1000] if len(content) > 1000 else content
    def fetch_rss_feed(self, feed_url: str) -> List[Dict[str, Any]]:
        """Fetch articles from a single RSS feed"""
        try:
            print(f"Fetching from: {feed_url}")
            # Use requests with proper headers and timeout
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
        if GROQ_AVAILABLE and settings.groq_api_key:
            try:
-                import requests
+                self.client = Groq(api_key=settings.groq_api_key)
-                response = requests.get(feed_url, headers=headers, timeout=15)
+                self.available = True
-                response.raise_for_status()
+                print("✅ Groq AI Analyzer initialized successfully")
                feed = feedparser.parse(response.content)
            except Exception as e:
-                print(f"HTTP request failed, trying direct feedparser: {e}")
+                print(f"❌ Groq initialization failed: {e}")
-                feed = feedparser.parse(feed_url)
+        else:
            print("⚠️  Groq AI Analyzer not available (missing API key or library)")
-            if feed.bozo:
+    def _make_groq_request(self, prompt: str, max_tokens: int = 500) -> Optional[str]:
-                print(f"Warning: Feed parsing issues for {feed_url}")
+        """Make a request to Groq API"""
-                if hasattr(feed, 'bozo_exception'):
+        if not self.available:
-                    print(f"Bozo exception: {feed.bozo_exception}")
+            return None
            articles = []
            source_name = getattr(feed.feed, 'title', urlparse(feed_url).netloc)
            for entry in feed.entries[:self.max_articles]:
                try:
                    # Extract article data
                    title = getattr(entry, 'title', 'No Title')
                    content = getattr(entry, 'summary', getattr(entry, 'description', ''))
                    url = getattr(entry, 'link', '')
                    published = getattr(entry, 'published', '')
                    # Parse date
                    try:
                        if published:
                            pub_date = datetime(*entry.published_parsed[:6])
                        else:
                            pub_date = datetime.now()
                    except:
                        pub_date = datetime.now()
                    # Create article object
                    article = {
                        "id": self.generate_article_id(title, url),
                        "title": title,
                        "content": self.clean_content(content),
                        "url": url,
                        "source": source_name,
                        "published_date": pub_date.isoformat(),
                        "fetched_date": datetime.now().isoformat(),
                        "categories": getattr(entry, 'tags', []),
                        "slug": title.lower().replace(" ", "-").replace("'", "")[:50]
                    }
                    articles.append(article)
                except Exception as e:
                    print(f"Error processing entry: {e}")
                    continue
            print(f"Fetched {len(articles)} articles from {source_name}")
            # If no articles but feed parsed successfully, it might be due to no new content
            if len(articles) == 0 and not feed.bozo:
                print(f"No new articles found in {source_name} (feed is valid)")
            return articles
        try:
            response = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are an expert news analyst. Provide concise, accurate analysis."},
                    {"role": "user", "content": prompt}
                ],
                model=self.model,
                max_tokens=max_tokens,
                temperature=0.3
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
-            print(f"Error fetching RSS feed {feed_url}: {e}")
+            print(f"❌ Groq API error: {e}")
            return None
    def summarize_article(self, article: Dict[str, Any]) -> Dict[str, Any]:
        """Generate an AI summary of a single article.

        Builds a prompt from the article's title and the first 1000
        characters of its content, sends it through
        ``self._make_groq_request`` and tries to decode the reply as JSON.

        Args:
            article: Mapping read via ``.get`` for ``'title'`` and
                ``'content'``. Missing or ``None`` values are treated as
                empty strings.

        Returns:
            * If the analyzer is unavailable:
              ``{"summary": "AI analysis not available", "available": False}``.
            * If the request yields nothing:
              ``{"summary": "Analysis failed", "available": False}``.
            * If the reply decodes to a JSON object: that object with
              ``"available": True`` and ``"analyzed_at"`` (ISO timestamp)
              added.
            * Otherwise (reply is not JSON, or is JSON but not an object):
              the raw reply under ``"summary"`` with the same two
              metadata keys.
        """
        if not self.available:
            return {"summary": "AI analysis not available", "available": False}

        # ``.get(key, '')`` still returns None when the key exists with a
        # None value; slicing None below would raise, so coerce to ''.
        title = article.get('title') or ''
        content = article.get('content') or ''

        prompt = f"""
        Analyze this news article and provide a concise summary:
        Title: {title}
        Content: {content[:1000]}...
        Provide:
        1. A 2-sentence summary
        2. 3 key points
        3. Main topic category
        Format as JSON:
        {{
            "summary": "Brief 2-sentence summary",
            "key_points": ["point1", "point2", "point3"],
            "category": "Technology/Business/Science/etc"
        }}
        """
        response = self._make_groq_request(prompt, max_tokens=300)
        if not response:
            return {"summary": "Analysis failed", "available": False}

        analyzed_at = datetime.now().isoformat()
        try:
            analysis = json.loads(response)
        except json.JSONDecodeError:
            analysis = None

        # A reply can be valid JSON without being an object (e.g. a bare
        # array or string); item assignment on those would raise TypeError,
        # so treat them like unparseable text and return the raw reply.
        if not isinstance(analysis, dict):
            return {
                "summary": response,
                "available": True,
                "analyzed_at": analyzed_at
            }

        analysis["available"] = True
        analysis["analyzed_at"] = analyzed_at
        return analysis
    def extract_keywords(self, article: Dict[str, Any]) -> List[str]:
        """Extract key terms and entities from article"""
        if not self.available:
            return []
-    def fetch_all_news(self) -> List[Dict[str, Any]]:
+        title = article.get('title', '')
-        """Fetch news from all configured RSS feeds"""
+        content = article.get('content', '')
        all_articles = []
-        for feed_url in settings.rss_feeds:
+        prompt = f"""
-            feed_url = feed_url.strip()
+        Extract the most important keywords and entities from this article:
            if feed_url:
                articles = self.fetch_rss_feed(feed_url)
                all_articles.extend(articles)
-        # Remove duplicates based on ID
+        Title: {title}
-        unique_articles = {}
+        Content: {content[:800]}...
        for article in all_articles:
            unique_articles[article['id']] = article
-        final_articles = list(unique_articles.values())
+        Return only a JSON array of 5-8 most relevant keywords:
-        print(f"Total unique articles fetched: {len(final_articles)}")
+        ["keyword1", "keyword2", "keyword3", ...]
        """
-        return final_articles
+        response = self._make_groq_request(prompt, max_tokens=100)
-    def save_articles(self, articles: List[Dict[str, Any]]) -> str:
+        if response:
-        """Save articles to JSON file"""
+            try:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                keywords = json.loads(response)
-        filename = f"news_{timestamp}.json"
+                return keywords if isinstance(keywords, list) else []
            except json.JSONDecodeError:
                # Fallback: extract from response text
                words = response.replace('[', '').replace(']', '').replace('"', '').split(',')
                return [word.strip() for word in words[:8]]
-        # Normalize the path to avoid double backslashes
+        return []
        raw_news_dir = os.path.normpath(self.raw_news_dir)
        filepath = os.path.normpath(os.path.join(raw_news_dir, filename))
-        # Ensure directory exists
+    def analyze_sentiment(self, article: Dict[str, Any]) -> Dict[str, Any]:
-        os.makedirs(raw_news_dir, exist_ok=True)
+        """Analyze sentiment and tone of article"""
        if not self.available:
            return {"sentiment": "neutral", "confidence": 0.0, "available": False}
-        with open(filepath, 'w', encoding='utf-8') as f:
+        title = article.get('title', '')
-            json.dump(articles, f, indent=2, ensure_ascii=False)
+        content = article.get('content', '')
-        print(f"Saved {len(articles)} articles to {filepath}")
+        prompt = f"""
-        return filepath
+        Analyze the sentiment and tone of this news article:
-    def fetch_and_save_news(self) -> Dict[str, Any]:
+        Title: {title}
-        """Fetch news and save to file"""
+        Content: {content[:600]}...
        articles = self.fetch_all_news()
-        if articles:
+        Return JSON with:
-            filepath = self.save_articles(articles)
+        {{
-            return {
+            "sentiment": "positive/negative/neutral",
-                "success": True,
+            "confidence": 0.85,
-                "articles_count": len(articles),
+            "tone": "informative/urgent/optimistic/concerned/etc",
-                "filepath": filepath,
+            "reasoning": "Brief explanation"
-                "articles": articles
+        }}
-            }
+        """
        else:
            return {
                "success": False,
                "articles_count": 0,
                "message": "No articles fetched"
            }
-# Test function
+        response = self._make_groq_request(prompt, max_tokens=150)
-if __name__ == "__main__":
+        
-    fetcher = NewsFetcher()
+        if response:
-    result = fetcher.fetch_and_save_news()
+            try:
-    print(f"Result: {result}")
+                sentiment = json.loads(response)
                sentiment["available"] = True
                return sentiment
            except json.JSONDecodeError:
                return {
                    "sentiment": "neutral",
                    "confidence": 0.5,
                    "tone": "informative",
                    "reasoning": response,
                    "available": True
                }
        return {"sentiment": "neutral", "confidence": 0.0, "available": False}
    def generate_insights(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate cross-article insights from a list of articles.

        Only the title and source of the first five articles are put into
        the prompt, which is sent through ``self._make_groq_request``.

        Args:
            articles: Article mappings; ``'title'`` and ``'source'`` are
                read via ``.get``.

        Returns:
            * If the analyzer is unavailable or ``articles`` is empty:
              ``{"insights": "AI insights not available", "available": False}``.
            * If the request yields nothing:
              ``{"insights": "Analysis failed", "available": False}``.
            * If the reply decodes to a JSON object: that object with
              ``"available"``, ``"analyzed_at"`` (ISO timestamp) and
              ``"article_count"`` (``len(articles)``) added.
            * Otherwise (reply is not JSON, or is JSON but not an object):
              the raw reply under ``"insights"`` with the same three
              metadata keys.
        """
        if not self.available or not articles:
            return {"insights": "AI insights not available", "available": False}

        # Prepare one numbered line per article, limited to 5 articles
        article_summaries = [
            f"{i+1}. {article.get('title', '')} (Source: {article.get('source', '')})"
            for i, article in enumerate(articles[:5])
        ]

        prompt = f"""
        Analyze these recent news articles and provide insights:
        Articles:
        {chr(10).join(article_summaries)}
        Provide:
        1. Main trends or themes
        2. Key developments
        3. Potential implications
        Format as JSON:
        {{
            "trends": ["trend1", "trend2"],
            "key_developments": ["development1", "development2"],
            "implications": "Brief analysis of what this means"
        }}
        """
        response = self._make_groq_request(prompt, max_tokens=400)
        if not response:
            return {"insights": "Analysis failed", "available": False}

        try:
            insights = json.loads(response)
        except json.JSONDecodeError:
            insights = None

        # Valid JSON is not necessarily an object; item assignment on a
        # list/str/number would raise TypeError, so fall back to raw text.
        if not isinstance(insights, dict):
            insights = {"insights": response}

        insights["available"] = True
        insights["analyzed_at"] = datetime.now().isoformat()
        insights["article_count"] = len(articles)
        return insights
    def get_status(self) -> Dict[str, Any]:
        """Report whether the analyzer is usable and what it offers.

        Returns:
            A dict with ``"available"`` (the instance flag), ``"model"``
            (the model name, or ``None`` when unavailable) and
            ``"features"`` (feature labels, or an empty list when
            unavailable).
        """
        status: Dict[str, Any] = {"available": self.available}
        if self.available:
            status["model"] = self.model
            status["features"] = [
                "Article Summarization",
                "Keyword Extraction",
                "Sentiment Analysis",
                "Trend Insights",
            ]
        else:
            # The model name is only reported for a working analyzer.
            status["model"] = None
            status["features"] = []
        return status