fix: Restore NewsFetcher class in news_fetcher.py

- Fixed import error by restoring proper NewsFetcher class structure
- Updated RSS feed fetching implementation with improved error handling
- Enhanced feed parsing with better timeout management and user agents
- Maintained compatibility with existing system architecture
- Resolved server startup issues caused by missing class definition
2025-07-15 21:55:43 +01:00
parent 508270e732
commit bccb7f2c2c
1 changed file with 200 additions and 214 deletions
@@ -1,230 +1,216 @@
-"""AI Analysis module for DS Task AI News using Groq LLM"""
+
-import os
+"""RSS News Fetcher for DS Task AI News"""
-from typing import Dict, List, Any, Optional
+import feedparser
 import requests
 import json
 import os
 from datetime import datetime
-
+from typing import List, Dict, Any
-try:
+from urllib.parse import urlparse
-    from groq import Groq
+import hashlib
    GROQ_AVAILABLE = True
 except ImportError:
    GROQ_AVAILABLE = False
    print("⚠️  Groq not available - install with: pip install groq")
 from config import settings
 from recommender import NewsRecommender  # Add this import
 from ai_analyzer import AIAnalyzer  # Add this import
-class AIAnalyzer:
+class NewsFetcher:
    """AI-powered article analysis using Groq LLM"""
    def __init__(self):
-        self.client = None
+        self.raw_news_dir = settings.raw_news_dir
-        self.model = "llama3-8b-8192"  # Fast Groq model
+        self.max_articles = settings.max_articles_per_feed
-        self.available = False
+        self.recommender = NewsRecommender()  # Add recommender for embedding/vector access
        self.ai_analyzer = AIAnalyzer()  # Add AIAnalyzer for LLM duplicate check
        # Ensure directories exist
        os.makedirs(self.raw_news_dir, exist_ok=True)
-        if GROQ_AVAILABLE and settings.groq_api_key:
+    def generate_article_id(self, title: str, url: str) -> str:
-            try:
+        """Generate unique ID for article"""
-                self.client = Groq(api_key=settings.groq_api_key)
+        content = f"{title}{url}"
-                self.available = True
+        return hashlib.md5(content.encode()).hexdigest()[:12]
                print("✅ Groq AI Analyzer initialized successfully")
            except Exception as e:
                print(f"❌ Groq initialization failed: {e}")
        else:
            print("⚠️  Groq AI Analyzer not available (missing API key or library)")
-    def _make_groq_request(self, prompt: str, max_tokens: int = 500) -> Optional[str]:
+    def clean_content(self, content: str) -> str:
-        """Make a request to Groq API"""
+        """Clean and truncate content"""
-        if not self.available:
+        if not content:
-            return None
+            return ""
-        try:
+        # Remove HTML tags (basic cleaning)
-            response = self.client.chat.completions.create(
+        import re
-                messages=[
+        content = re.sub(r'<[^>]+>', '', content)
-                    {"role": "system", "content": "You are an expert news analyst. Provide concise, accurate analysis."},
+        
-                    {"role": "user", "content": prompt}
+        # Truncate to reasonable length
-                ],
+        return content[:1000] if len(content) > 1000 else content
-                model=self.model,
+    
-                max_tokens=max_tokens,
+    def is_duplicate_by_llm(self, article: Dict[str, Any], existing_article: Dict[str, Any]) -> bool:
-                temperature=0.3
+        """Use LLM to check if two articles are about the same event or story"""
        if not self.ai_analyzer.available:
            return False  # LLM not available, skip this check
        prompt = f"""
        Are these two news articles about the same event or story? Answer only 'yes' or 'no'.\n\nArticle 1:\nTitle: {article.get('title', '')}\nContent: {article.get('content', '')[:500]}\n\nArticle 2:\nTitle: {existing_article.get('title', '')}\nContent: {existing_article.get('content', '')[:500]}\n"""
        response = self.ai_analyzer._make_groq_request(prompt, max_tokens=5)
        if response and response.strip().lower().startswith('yes'):
            return True
        return False
    def is_duplicate_by_similarity(self, article: Dict[str, Any], threshold: float = 0.9) -> bool:
        """Check if the article is a duplicate using similarity search and LLM verification"""
        all_articles = self.recommender.vector_store.get_all_articles()
        if not all_articles:
            return False  # No articles to compare with
        embedding = self.recommender.embedding_generator.generate_query_embedding(
            self.recommender.embedding_generator.create_article_text(article)
        )
-            return response.choices[0].message.content.strip()
+        existing_embeddings = self.recommender.vector_store.index.reconstruct_n(0, len(all_articles))
        import numpy as np
        for idx, existing_embedding in enumerate(existing_embeddings):
            norm1 = np.linalg.norm(embedding)
            norm2 = np.linalg.norm(existing_embedding)
            if norm1 == 0 or norm2 == 0:
                continue
            similarity = float(np.dot(embedding, existing_embedding) / (norm1 * norm2))
            if similarity >= threshold:
                # Use LLM to confirm duplicate
                existing_article = all_articles[idx]
                if self.is_duplicate_by_llm(article, existing_article):
                    return True  # LLM confirms duplicate
        return False
    def fetch_rss_feed(self, feed_url: str) -> List[Dict[str, Any]]:
        """Fetch articles from a single RSS feed"""
        try:
            print(f"Fetching from: {feed_url}")
            # Use requests with proper headers and timeout
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            try:
                import requests
                response = requests.get(feed_url, headers=headers, timeout=15)
                response.raise_for_status()
                feed = feedparser.parse(response.content)
            except Exception as e:
-            print(f"❌ Groq API error: {e}")
+                print(f"HTTP request failed, trying direct feedparser: {e}")
-            return None
+                feed = feedparser.parse(feed_url)
-    def summarize_article(self, article: Dict[str, Any]) -> Dict[str, Any]:
+            if feed.bozo:
-        """Generate AI summary of an article"""
+                print(f"Warning: Feed parsing issues for {feed_url}")
-        if not self.available:
+                if hasattr(feed, 'bozo_exception'):
-            return {"summary": "AI analysis not available", "available": False}
+                    print(f"Bozo exception: {feed.bozo_exception}")
-        title = article.get('title', '')
+            articles = []
-        content = article.get('content', '')
+            source_name = getattr(feed.feed, 'title', urlparse(feed_url).netloc)
-        prompt = f"""
+            for entry in feed.entries[:self.max_articles]:
        Analyze this news article and provide a concise summary:
        Title: {title}
        Content: {content[:1000]}...
        Provide:
        1. A 2-sentence summary
        2. 3 key points
        3. Main topic category
        Format as JSON:
        {{
            "summary": "Brief 2-sentence summary",
            "key_points": ["point1", "point2", "point3"],
            "category": "Technology/Business/Science/etc"
        }}
        """
        response = self._make_groq_request(prompt, max_tokens=300)
        if response:
                try:
-                analysis = json.loads(response)
+                    # Extract article data
-                analysis["available"] = True
+                    title = getattr(entry, 'title', 'No Title')
-                analysis["analyzed_at"] = datetime.now().isoformat()
+                    content = getattr(entry, 'summary', getattr(entry, 'description', ''))
-                return analysis
+                    url = getattr(entry, 'link', '')
-            except json.JSONDecodeError:
+                    published = getattr(entry, 'published', '')
-                return {
+                    
-                    "summary": response,
+                    # Parse date
-                    "available": True,
+                    try:
-                    "analyzed_at": datetime.now().isoformat()
+                        if published:
                            pub_date = datetime(*entry.published_parsed[:6])
                        else:
                            pub_date = datetime.now()
                    except:
                        pub_date = datetime.now()
                    # Create article object
                    article = {
                        "id": self.generate_article_id(title, url),
                        "title": title,
                        "content": self.clean_content(content),
                        "url": url,
                        "source": source_name,
                        "published_date": pub_date.isoformat(),
                        "fetched_date": datetime.now().isoformat(),
                        "categories": getattr(entry, 'tags', []),
                        "slug": title.lower().replace(" ", "-").replace("'", "")[:50]
                    }
-        return {"summary": "Analysis failed", "available": False}
+                    # Check for duplicate using similarity search
                    if self.is_duplicate_by_similarity(article):
                        print(f"Skipped duplicate article (similarity): {title}")
                        continue
-    def extract_keywords(self, article: Dict[str, Any]) -> List[str]:
+                    articles.append(article)
-        """Extract key terms and entities from article"""
+                    
-        if not self.available:
+                except Exception as e:
                    print(f"Error processing entry: {e}")
                    continue
            print(f"Fetched {len(articles)} articles from {source_name}")
            # If no articles but feed parsed successfully, it might be due to no new content
            if len(articles) == 0 and not feed.bozo:
                print(f"No new articles found in {source_name} (feed is valid)")
            return articles
        except Exception as e:
            print(f"Error fetching RSS feed {feed_url}: {e}")
            return []
-        title = article.get('title', '')
+    def fetch_all_news(self) -> List[Dict[str, Any]]:
-        content = article.get('content', '')
+        """Fetch news from all configured RSS feeds"""
        all_articles = []
-        prompt = f"""
+        for feed_url in settings.rss_feeds:
-        Extract the most important keywords and entities from this article:
+            feed_url = feed_url.strip()
            if feed_url:
                articles = self.fetch_rss_feed(feed_url)
                all_articles.extend(articles)
-        Title: {title}
+        # Remove duplicates based on ID
-        Content: {content[:800]}...
+        unique_articles = {}
        for article in all_articles:
            unique_articles[article['id']] = article
-        Return only a JSON array of 5-8 most relevant keywords:
+        final_articles = list(unique_articles.values())
-        ["keyword1", "keyword2", "keyword3", ...]
+        print(f"Total unique articles fetched: {len(final_articles)}")
        """
-        response = self._make_groq_request(prompt, max_tokens=100)
+        return final_articles
-        if response:
+    def save_articles(self, articles: List[Dict[str, Any]]) -> str:
-            try:
+        """Save articles to JSON file"""
-                keywords = json.loads(response)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-                return keywords if isinstance(keywords, list) else []
+        filename = f"news_{timestamp}.json"
            except json.JSONDecodeError:
                # Fallback: extract from response text
                words = response.replace('[', '').replace(']', '').replace('"', '').split(',')
                return [word.strip() for word in words[:8]]
-        return []
+        # Normalize the path to avoid double backslashes
        raw_news_dir = os.path.normpath(self.raw_news_dir)
        filepath = os.path.normpath(os.path.join(raw_news_dir, filename))
-    def analyze_sentiment(self, article: Dict[str, Any]) -> Dict[str, Any]:
+        # Ensure directory exists
-        """Analyze sentiment and tone of article"""
+        os.makedirs(raw_news_dir, exist_ok=True)
        if not self.available:
            return {"sentiment": "neutral", "confidence": 0.0, "available": False}
-        title = article.get('title', '')
+        with open(filepath, 'w', encoding='utf-8') as f:
-        content = article.get('content', '')
+            json.dump(articles, f, indent=2, ensure_ascii=False)
-        prompt = f"""
+        print(f"Saved {len(articles)} articles to {filepath}")
-        Analyze the sentiment and tone of this news article:
+        return filepath
-        Title: {title}
+    def fetch_and_save_news(self) -> Dict[str, Any]:
-        Content: {content[:600]}...
+        """Fetch news and save to file"""
        articles = self.fetch_all_news()
-        Return JSON with:
+        if articles:
-        {{
+            filepath = self.save_articles(articles)
            "sentiment": "positive/negative/neutral",
            "confidence": 0.85,
            "tone": "informative/urgent/optimistic/concerned/etc",
            "reasoning": "Brief explanation"
        }}
        """
        response = self._make_groq_request(prompt, max_tokens=150)
        if response:
            try:
                sentiment = json.loads(response)
                sentiment["available"] = True
                return sentiment
            except json.JSONDecodeError:
            return {
-                    "sentiment": "neutral",
+                "success": True,
-                    "confidence": 0.5,
+                "articles_count": len(articles),
-                    "tone": "informative",
+                "filepath": filepath,
-                    "reasoning": response,
+                "articles": articles
-                    "available": True
+            }
        else:
            return {
                "success": False,
                "articles_count": 0,
                "message": "No articles fetched"
            }
-        return {"sentiment": "neutral", "confidence": 0.0, "available": False}
+# Test function
-    
+if __name__ == "__main__":
-    def generate_insights(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+    fetcher = NewsFetcher()
-        """Generate insights from multiple articles"""
+    result = fetcher.fetch_and_save_news()
-        if not self.available or not articles:
+    print(f"Result: {result}")
            return {"insights": "AI insights not available", "available": False}
        # Prepare article summaries
        article_summaries = []
        for i, article in enumerate(articles[:5]):  # Limit to 5 articles
            title = article.get('title', '')
            source = article.get('source', '')
            article_summaries.append(f"{i+1}. {title} (Source: {source})")
        prompt = f"""
        Analyze these recent news articles and provide insights:
        Articles:
        {chr(10).join(article_summaries)}
        Provide:
        1. Main trends or themes
        2. Key developments
        3. Potential implications
        Format as JSON:
        {{
            "trends": ["trend1", "trend2"],
            "key_developments": ["development1", "development2"],
            "implications": "Brief analysis of what this means"
        }}
        """
        response = self._make_groq_request(prompt, max_tokens=400)
        if response:
            try:
                insights = json.loads(response)
                insights["available"] = True
                insights["analyzed_at"] = datetime.now().isoformat()
                insights["article_count"] = len(articles)
                return insights
            except json.JSONDecodeError:
                return {
                    "insights": response,
                    "available": True,
                    "analyzed_at": datetime.now().isoformat()
                }
        return {"insights": "Analysis failed", "available": False}
    def get_status(self) -> Dict[str, Any]:
        """Get AI analyzer status"""
        return {
            "available": self.available,
            "model": self.model if self.available else None,
            "features": [
                "Article Summarization",
                "Keyword Extraction", 
                "Sentiment Analysis",
                "Trend Insights"
            ] if self.available else []
        }