feat: Implement stock listing extraction and database population

- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
2025-11-06 12:34:01 +01:00
parent 389a01cb0a
commit 80ee708348
39 changed files with 8513 additions and 0 deletions
@@ -0,0 +1,275 @@
+"""
+Example analysis script - What you can do with the collected data
+"""
+
+import sqlite3
+import json
+import os
+from collections import defaultdict
+
class StockAnalyzer:
    """Console reporting helper for the collected stock data.

    Queries the ``stocks_master`` and ``coverage_report`` tables of a SQLite
    database and reads the per-ticker JSON files stored under ``data_dir``
    (``news/<ticker>_news_pr.json`` and ``financials/<ticker>_yahoo.json``),
    then prints human-readable summaries to stdout.  Only ``SELECT``
    statements are issued against the database.

    The instance owns a database connection.  Release it with :meth:`close`
    or use the analyzer as a context manager::

        with StockAnalyzer() as analyzer:
            analyzer.get_summary_stats()
    """

    def __init__(self, db_path: str = "data/stocks.db", data_dir: str = "data") -> None:
        """Open the database and remember where the JSON data lives.

        Args:
            db_path: Path of the SQLite database file.  Note that
                ``sqlite3.connect`` creates an empty database when the file
                does not exist, so callers should check for it beforehand.
            data_dir: Directory containing the ``news``, ``financials`` and
                ``reports`` sub-directories.
        """
        self.data_dir = data_dir
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()

    def __enter__(self) -> "StockAnalyzer":
        """Return the analyzer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> bool:
        """Close the connection on block exit; exceptions are not suppressed."""
        self.close()
        return False

    @staticmethod
    def _print_header(title: str) -> None:
        """Print *title* framed by two 70-character rules."""
        print("\n" + "=" * 70)
        print(title)
        print("=" * 70)

    @staticmethod
    def _load_json(path: str):
        """Return the JSON object stored at *path*, or ``None`` if unusable.

        Files are always decoded as UTF-8 instead of the platform default so
        non-ASCII text (e.g. news titles) loads the same on every OS.  A
        missing, undecodable, malformed or non-object file produces a warning
        on stdout rather than aborting the caller's report.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"  Warning: skipping unreadable JSON file {path}: {exc}")
            return None
        if not isinstance(data, dict):
            print(f"  Warning: skipping {path}: expected a JSON object")
            return None
        return data

    @staticmethod
    def _as_list(value) -> list:
        """Return *value* if it is a list, otherwise an empty list.

        Guards against keys that are missing or stored as JSON ``null``.
        """
        return value if isinstance(value, list) else []

    def get_summary_stats(self) -> None:
        """Print totals, per-exchange counts, top sectors and data coverage."""
        self._print_header("DATABASE SUMMARY STATISTICS")

        # Total stocks
        self.cursor.execute("SELECT COUNT(*) FROM stocks_master")
        total_stocks = self.cursor.fetchone()[0]
        print(f"\nTotal stocks tracked: {total_stocks}")

        # By exchange
        self.cursor.execute("""
            SELECT exchange, COUNT(*) as count 
            FROM stocks_master 
            GROUP BY exchange 
            ORDER BY count DESC
        """)
        print("\nStocks by exchange:")
        for exchange, count in self.cursor.fetchall():
            print(f"  {exchange}: {count}")

        # By sector (if available)
        self.cursor.execute("""
            SELECT sector, COUNT(*) as count 
            FROM stocks_master 
            WHERE sector IS NOT NULL AND sector != ''
            GROUP BY sector 
            ORDER BY count DESC 
            LIMIT 10
        """)
        sectors = self.cursor.fetchall()
        if sectors:
            print("\nTop 10 sectors:")
            for sector, count in sectors:
                print(f"  {sector}: {count}")

        # Coverage stats
        self.cursor.execute("""
            SELECT 
                SUM(CASE WHEN has_financials = 1 THEN 1 ELSE 0 END) as with_financials,
                SUM(CASE WHEN has_news = 1 THEN 1 ELSE 0 END) as with_news,
                SUM(CASE WHEN has_press_releases = 1 THEN 1 ELSE 0 END) as with_pr,
                SUM(CASE WHEN has_financials = 1 AND has_news = 1 AND has_press_releases = 1 THEN 1 ELSE 0 END) as complete
            FROM coverage_report
        """)
        # SUM() over an empty table yields NULL; report that as 0, not "None".
        fin, news, pr, complete = (value or 0 for value in self.cursor.fetchone())

        print("\nData coverage:")
        print(f"  Stocks with financials: {fin}")
        print(f"  Stocks with news: {news}")
        print(f"  Stocks with press releases: {pr}")
        print(f"  Fully covered stocks: {complete}")

    def find_recent_news_activity(self, limit: int = 20) -> None:
        """Print the *limit* tickers with the most news items and press releases.

        Counts come from the ``news_articles`` and ``press_releases`` lists of
        every ``*_news_pr.json`` file in the news directory.  Tickers with no
        items are omitted; unreadable files are skipped with a warning.
        """
        self._print_header(f"TOP {limit} STOCKS BY NEWS ACTIVITY")

        news_dir = os.path.join(self.data_dir, "news")
        if not os.path.isdir(news_dir):
            print("No news data available yet")
            return

        suffix = '_news_pr.json'
        stock_news_count = []

        # Sorted so warnings and the scan order do not depend on the filesystem.
        for filename in sorted(os.listdir(news_dir)):
            if not filename.endswith(suffix):
                continue

            data = self._load_json(os.path.join(news_dir, filename))
            if data is None:
                continue

            news_count = len(self._as_list(data.get('news_articles')))
            pr_count = len(self._as_list(data.get('press_releases')))
            if news_count or pr_count:
                stock_news_count.append({
                    'ticker': filename[:-len(suffix)],
                    'news': news_count,
                    'pr': pr_count,
                    'total': news_count + pr_count,
                })

        # Most active first; ticker breaks ties so the output is deterministic.
        stock_news_count.sort(key=lambda item: (-item['total'], item['ticker']))

        print(f"\n{'Ticker':<10} {'News':<10} {'PR':<10} {'Total':<10}")
        print("-" * 40)
        for stock in stock_news_count[:limit]:
            print(f"{stock['ticker']:<10} {stock['news']:<10} {stock['pr']:<10} {stock['total']:<10}")

    def find_stocks_by_sector(self, sector: str) -> None:
        """Print every stock whose sector contains *sector* (SQL ``LIKE`` match)."""
        self._print_header(f"STOCKS IN SECTOR: {sector.upper()}")

        self.cursor.execute("""
            SELECT symbol, company_name, exchange 
            FROM stocks_master 
            WHERE sector LIKE ? 
            ORDER BY symbol
        """, (f"%{sector}%",))

        stocks = self.cursor.fetchall()
        if not stocks:
            print(f"\nNo stocks found in sector: {sector}")
            return

        print(f"\nFound {len(stocks)} stocks:")
        for symbol, name, exchange in stocks:
            # NULL columns come back as None, which rejects width format specs.
            print(f"  {symbol or '':<8} {name or '':<50} [{exchange}]")

    def get_stock_report(self, ticker: str) -> None:
        """Print everything known about *ticker*.

        Shows the ``stocks_master`` row, the ``coverage_report`` flags, the
        current price from the financial JSON file, news/press-release counts
        with up to three recent titles, and the path of a generated report
        file when one exists.  Prints a notice and returns if the ticker is
        not in ``stocks_master``.
        """
        self._print_header(f"STOCK REPORT: {ticker}")

        # Get basic info
        self.cursor.execute("""
            SELECT company_name, exchange, sector, industry, listing_date 
            FROM stocks_master 
            WHERE symbol = ?
        """, (ticker,))

        row = self.cursor.fetchone()
        if not row:
            print(f"\nStock {ticker} not found in database")
            return

        name, exchange, sector, industry, listing_date = row

        print(f"\nCompany: {name or 'N/A'}")
        print(f"Exchange: {exchange}")
        if sector:
            print(f"Sector: {sector}")
        if industry:
            print(f"Industry: {industry}")
        if listing_date:
            print(f"Listing Date: {listing_date}")

        # Check coverage
        self.cursor.execute("""
            SELECT has_financials, has_news, has_press_releases 
            FROM coverage_report 
            WHERE ticker = ?
        """, (ticker,))

        coverage = self.cursor.fetchone()
        if coverage:
            has_fin, has_news, has_pr = coverage
            print("\nData Coverage:")
            print(f"  Financials: {'✅' if has_fin else '❌'}")
            print(f"  News: {'✅' if has_news else '❌'}")
            print(f"  Press Releases: {'✅' if has_pr else '❌'}")

        # Load financial data if available
        fin_file = os.path.join(self.data_dir, "financials", f"{ticker}_yahoo.json")
        if os.path.exists(fin_file):
            print(f"\nFinancial Data: (see {fin_file})")
            fin_data = self._load_json(fin_file) or {}
            profile = fin_data.get('profile')
            # 'profile' may be absent or null in the scraped file.
            price = profile.get('current_price') if isinstance(profile, dict) else None
            if price:
                print(f"  Current Price: ${price}")

        # Load news if available
        news_file = os.path.join(self.data_dir, "news", f"{ticker}_news_pr.json")
        if os.path.exists(news_file):
            news_data = self._load_json(news_file)
            if news_data is not None:
                articles = self._as_list(news_data.get('news_articles'))
                releases = self._as_list(news_data.get('press_releases'))
                print("\nNews & Press Releases:")
                print(f"  News articles: {len(articles)}")
                print(f"  Press releases: {len(releases)}")

                if articles:
                    print("\n  Recent news:")
                    for article in articles[:3]:
                        title = article.get('title', 'N/A') if isinstance(article, dict) else 'N/A'
                        print(f"    - {title}")

        # Check if report exists
        report_file = os.path.join(self.data_dir, "reports", f"{ticker}_report.txt")
        if os.path.exists(report_file):
            print(f"\nFull report available at: {report_file}")

    def export_to_csv(self, output_file: str = "stock_list.csv") -> None:
        """Write every stock and its coverage flags to *output_file* as CSV.

        Stocks without a ``coverage_report`` row are still exported (LEFT
        JOIN); their coverage columns are left empty.
        """
        import csv

        self._print_header(f"EXPORTING TO CSV: {output_file}")

        self.cursor.execute("""
            SELECT s.symbol, s.company_name, s.exchange, s.sector, s.industry,
                   c.has_financials, c.has_news, c.has_press_releases
            FROM stocks_master s
            LEFT JOIN coverage_report c ON s.symbol = c.ticker
            ORDER BY s.symbol
        """)

        # newline='' lets the csv module control line endings itself.
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Symbol', 'Company', 'Exchange', 'Sector', 'Industry',
                             'Has Financials', 'Has News', 'Has PR'])
            writer.writerows(self.cursor.fetchall())

        print(f"\n✅ Exported to {output_file}")

    def close(self) -> None:
        """Close the underlying database connection."""
        self.conn.close()
+
+
def main(db_path="data/stocks.db"):
    """Run the example analyses against the collected stock database.

    Args:
        db_path: Path of the SQLite database to analyse.  When the file does
            not exist a hint is printed and nothing else happens (in
            particular, no empty database is created).
    """
    print("\n" + "=" * 70)
    print("STOCK DATA ANALYSIS - EXAMPLES")
    print("=" * 70)

    # Check if database exists
    if not os.path.exists(db_path):
        print("\n❌ Database not found!")
        print("   Run 'python main.py' first to collect data")
        return

    analyzer = StockAnalyzer(db_path)
    try:
        # Example 1: Get summary statistics
        analyzer.get_summary_stats()

        # Example 2: Find most active stocks (by news)
        analyzer.find_recent_news_activity(limit=10)

        # Example 3: Find stocks in a sector
        # analyzer.find_stocks_by_sector("Technology")

        # Example 4: Get report for specific stock
        # analyzer.get_stock_report("ABC")

        # Example 5: Export to CSV
        # analyzer.export_to_csv("my_stocks.csv")
    finally:
        # Release the connection even if one of the analyses raises.
        analyzer.close()

    print("\n" + "=" * 70)
    print("ANALYSIS COMPLETE")
    print("=" * 70)
    print("\nYou can modify this script to:")
    print("  - Filter stocks by criteria (P/E, market cap, etc.)")
    print("  - Find stocks with specific keywords in news")
    print("  - Compare stocks within sectors")
    print("  - Track changes over time")
    print("  - Generate custom reports")


# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()