feat: Implement stock listing extraction and database population

- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
2025-11-06 12:34:01 +01:00
parent 389a01cb0a
commit 80ee708348
39 changed files with 8513 additions and 0 deletions
@@ -0,0 +1,235 @@
+"""
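+Populate the database with existing JSON data.
+
+This script reads the JSON files already saved under data/metrics, data/news,
+data/serpapi_news, data/sec_filings and data/sedar_filings, and inserts their
+contents into the database.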
+"""
+
+import os
+import json
+from datetime import datetime
+from database import StockDatabase
+
+
+def populate_from_existing_data():
+    """Read all existing JSON files and populate database"""
+    print("\n" + "=" * 70)
+    print("POPULATING DATABASE FROM EXISTING JSON FILES")
+    print("=" * 70)
+    
+    db = StockDatabase()
+    
+    stats = {
+        'metrics': 0,
+        'news': 0,
+        'filings': 0
+    }
+    
+    # 1. Import calculated metrics
+    print("\n📊 Importing financial metrics...")
+    metrics_dir = "data/metrics"
+    if os.path.exists(metrics_dir):
+        for filename in os.listdir(metrics_dir):
+            if filename.endswith('_calculated_metrics.json'):
+                ticker = filename.replace('_calculated_metrics.json', '')
+                filepath = os.path.join(metrics_dir, filename)
+                
+                try:
+                    with open(filepath, 'r') as f:
+                        metrics = json.load(f)
+                    
+                    # Insert metrics into database
+                    current_year = datetime.now().year
+                    success = db.insert_financial_metrics(ticker, current_year, metrics, is_ttm=True)
+                    
+                    if success:
+                        stats['metrics'] += 1
+                        print(f"   ✓ {ticker}: {len(metrics)} metrics")
+                    
+                except Exception as e:
+                    print(f"   ✗ {ticker}: {e}")
+    
+    # 2. Import news articles (from both regular scraping and SerpAPI)
+    print("\n📰 Importing news articles...")
+    
+    # Regular news
+    news_dir = "data/news"
+    if os.path.exists(news_dir):
+        for filename in os.listdir(news_dir):
+            if filename.endswith('_news_pr.json'):
+                ticker = filename.replace('_news_pr.json', '')
+                filepath = os.path.join(news_dir, filename)
+                
+                try:
+                    with open(filepath, 'r') as f:
+                        data = json.load(f)
+                    
+                    # Insert news articles
+                    articles = data.get('news_articles', [])
+                    for article in articles:
+                        success = db.insert_news_article(
+                            ticker=ticker,
+                            title=article.get('title', ''),
+                            source=article.get('source', ''),
+                            published_date=article.get('date', ''),
+                            url=article.get('url', ''),
+                            snippet=article.get('snippet', '')
+                        )
+                        if success:
+                            stats['news'] += 1
+                    
+                    # Insert press releases
+                    prs = data.get('press_releases', [])
+                    for pr in prs:
+                        success = db.insert_news_article(
+                            ticker=ticker,
+                            title=pr.get('title', ''),
+                            source=pr.get('source', 'Press Release'),
+                            published_date=pr.get('date', ''),
+                            url=pr.get('url', ''),
+                            snippet=pr.get('snippet', '')
+                        )
+                        if success:
+                            stats['news'] += 1
+                    
+                    if articles or prs:
+                        print(f"   ✓ {ticker}: {len(articles)} articles, {len(prs)} PRs")
+                    
+                except Exception as e:
+                    print(f"   ✗ {ticker}: {e}")
+    
+    # SerpAPI news
+    serpapi_dir = "data/serpapi_news"
+    if os.path.exists(serpapi_dir):
+        for filename in os.listdir(serpapi_dir):
+            if filename.endswith('_serpapi.json'):
+                ticker = filename.replace('_serpapi.json', '')
+                filepath = os.path.join(serpapi_dir, filename)
+                
+                try:
+                    with open(filepath, 'r') as f:
+                        data = json.load(f)
+                    
+                    # Insert news articles
+                    articles = data.get('news_articles', [])
+                    for article in articles:
+                        success = db.insert_news_article(
+                            ticker=ticker,
+                            title=article.get('title', ''),
+                            source=article.get('source', ''),
+                            published_date=article.get('date', ''),
+                            url=article.get('link', ''),
+                            snippet=article.get('snippet', '')
+                        )
+                        if success:
+                            stats['news'] += 1
+                    
+                    # Insert press releases
+                    prs = data.get('press_releases', [])
+                    for pr in prs:
+                        success = db.insert_news_article(
+                            ticker=ticker,
+                            title=pr.get('title', ''),
+                            source=pr.get('source', 'Press Release'),
+                            published_date=pr.get('date', ''),
+                            url=pr.get('link', ''),
+                            snippet=pr.get('snippet', '')
+                        )
+                        if success:
+                            stats['news'] += 1
+                    
+                    if articles or prs:
+                        print(f"   ✓ {ticker}: {len(articles)} SerpAPI articles, {len(prs)} PRs")
+                    
+                except Exception as e:
+                    print(f"   ✗ {ticker}: {e}")
+    
+    # 3. Import SEC filings
+    print("\n📄 Importing SEC EDGAR filings...")
+    sec_dir = "data/sec_filings"
+    if os.path.exists(sec_dir):
+        for filename in os.listdir(sec_dir):
+            if filename.endswith('_sec_filings.json'):
+                ticker = filename.replace('_sec_filings.json', '')
+                filepath = os.path.join(sec_dir, filename)
+                
+                try:
+                    with open(filepath, 'r') as f:
+                        data = json.load(f)
+                    
+                    # Insert filings
+                    filings = data.get('filings', [])
+                    for filing in filings:
+                        db.insert_filing(
+                            ticker=ticker,
+                            filing_date=filing.get('filing_date', ''),
+                            filing_type=filing.get('form_type', ''),
+                            title=filing.get('description', ''),
+                            document_url=filing.get('url', ''),
+                            source='SEC EDGAR'
+                        )
+                        stats['filings'] += 1
+                    
+                    # Insert ownership forms
+                    ownership = data.get('insider_ownership', [])
+                    for form in ownership:
+                        db.insert_filing(
+                            ticker=ticker,
+                            filing_date=form.get('filing_date', ''),
+                            filing_type=form.get('form_type', ''),
+                            title=f"Insider Transaction - {form.get('owner', '')}",
+                            document_url=form.get('url', ''),
+                            source='SEC EDGAR - Ownership'
+                        )
+                        stats['filings'] += 1
+                    
+                    if filings or ownership:
+                        print(f"   ✓ {ticker}: {len(filings)} filings, {len(ownership)} ownership")
+                    
+                except Exception as e:
+                    print(f"   ✗ {ticker}: {e}")
+    
+    # 4. Import SEDAR+ filings
+    print("\n📄 Importing SEDAR+ filings...")
+    sedar_dir = "data/sedar_filings"
+    if os.path.exists(sedar_dir):
+        for filename in os.listdir(sedar_dir):
+            if filename.endswith('_sedar_data.json'):
+                ticker = filename.replace('_sedar_data.json', '')
+                filepath = os.path.join(sedar_dir, filename)
+                
+                try:
+                    with open(filepath, 'r') as f:
+                        data = json.load(f)
+                    
+                    # Insert filings
+                    filings = data.get('filings', [])
+                    for filing in filings:
+                        db.insert_filing(
+                            ticker=ticker,
+                            filing_date=filing.get('date', ''),
+                            filing_type=filing.get('type', ''),
+                            title=filing.get('title', ''),
+                            document_url=filing.get('url', ''),
+                            source='SEDAR+'
+                        )
+                        stats['filings'] += 1
+                    
+                    if filings:
+                        print(f"   ✓ {ticker}: {len(filings)} SEDAR+ filings")
+                    
+                except Exception as e:
+                    print(f"   ✗ {ticker}: {e}")
+    
+    # Print final stats
+    print("\n" + "=" * 70)
+    print("DATABASE POPULATION COMPLETE")
+    print("=" * 70)
+    print(f"Financial metrics inserted: {stats['metrics']}")
+    print(f"News articles inserted: {stats['news']}")
+    print(f"Filings inserted: {stats['filings']}")
+    print("=" * 70)
+    
+    db.close()
+
+
+# Run the import only when this file is executed as a script, not when it is
+# imported as a module.
+if __name__ == "__main__":
+    populate_from_existing_data()