feat: Implement stock listing extraction and database population

- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
2025-11-06 12:34:01 +01:00
parent 389a01cb0a
commit 80ee708348
39 changed files with 8513 additions and 0 deletions
@@ -0,0 +1,265 @@
+"""
+Extract stock listings from TSX, TSXV, CSE, and CBOE
+Uses Playwright to handle JavaScript-rendered content
+"""
+
+import asyncio
+import json
+import os
+from datetime import datetime
+from playwright.async_api import async_playwright
+import time
+
+class StockListingExtractor:
+    """Scrape exchange listing pages with Playwright and store them as JSON.
+
+    Each ``extract_*`` coroutine launches Chromium, loads one exchange's
+    listing page, turns the table rows it finds into dictionaries, writes a
+    snapshot of the page HTML plus the parsed listings under ``output_dir``,
+    and returns the parsed listings.  Errors raised after the browser has been
+    launched are printed rather than propagated, and the coroutine then
+    returns whatever was parsed so far (possibly an empty list).
+    """
+
+    def __init__(self, output_dir="data/listings", headless=False):
+        """Remember the settings and create ``output_dir`` if it is missing.
+
+        Args:
+            output_dir: Directory receiving the HTML snapshots and JSON files.
+            headless: Passed to ``chromium.launch``.  Defaults to ``False``,
+                which is the value that was previously hard-coded.
+        """
+        self.output_dir = output_dir
+        self.headless = headless
+        os.makedirs(output_dir, exist_ok=True)
+
+    @staticmethod
+    def _clean_symbol(text):
+        """Strip a ticker cell, drop newlines/carriage returns, turn tabs into spaces."""
+        return text.strip().replace('\n', '').replace('\r', '').replace('\t', ' ')
+
+    @staticmethod
+    def _clean_text(text):
+        """Strip a descriptive cell and replace newlines/carriage returns with spaces."""
+        return text.strip().replace('\n', ' ').replace('\r', ' ')
+
+    async def _parse_rows(self, rows, *, cell_selector, min_cells, exchange,
+                          country, extra_field=None):
+        """Convert row element handles into listing dictionaries.
+
+        Args:
+            rows: Element handles returned by ``page.query_selector_all``.
+            cell_selector: Selector used to find the cells inside each row.
+            min_cells: Rows with fewer cells than this are skipped.
+            exchange: Value stored under the ``'exchange'`` key.
+            country: Value stored under the ``'country'`` key.
+            extra_field: Optional key (``'sector'`` or ``'industry'``) filled
+                from the third cell, or with ``""`` when the row has no third
+                cell.
+
+        Returns:
+            A list of dictionaries, one per row that had enough cells.  A row
+            whose parsing raises is reported with ``print`` and skipped.
+        """
+        listings = []
+        for row in rows:
+            try:
+                cells = await row.query_selector_all(cell_selector)
+                if len(cells) < min_cells:
+                    continue
+
+                record = {
+                    'symbol': self._clean_symbol(await cells[0].inner_text()),
+                    'name': self._clean_text(await cells[1].inner_text()),
+                }
+                if extra_field is not None:
+                    raw_extra = await cells[2].inner_text() if len(cells) > 2 else ""
+                    record[extra_field] = self._clean_text(raw_extra)
+                record['exchange'] = exchange
+                record['country'] = country
+                record['extracted_at'] = datetime.now().isoformat()
+                listings.append(record)
+            except Exception as e:
+                # One malformed row must not abort the whole page.
+                print(f"Error parsing row: {e}")
+        return listings
+
+    async def _save_page_html(self, page, filename):
+        """Write the page's current HTML to ``output_dir/filename`` for inspection."""
+        html_content = await page.content()
+        with open(f"{self.output_dir}/{filename}", 'w', encoding='utf-8') as f:
+            f.write(html_content)
+
+    def _save_listings(self, listings, filename, label):
+        """Dump ``listings`` as JSON to ``output_dir/filename`` and report it.
+
+        ``label`` is the exchange name used in the confirmation message.
+        """
+        output_file = f"{self.output_dir}/{filename}"
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(listings, f, indent=2)
+        print(f"✅ Saved {len(listings)} {label} listings to {output_file}")
+
+    async def extract_tsx_tsxv_listings(self):
+        """Extract TSX and TSXV listings.
+
+        Returns:
+            A list of dictionaries with the keys ``symbol``, ``name``,
+            ``sector``, ``exchange``, ``country`` and ``extracted_at``.
+        """
+        print("🔍 Extracting TSX/TSXV listings...")
+
+        # Bound before the try block so that a failed page load still returns
+        # a list instead of raising UnboundLocalError at the return statement.
+        listings = []
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=self.headless)
+            page = await browser.new_page()
+
+            try:
+                # Navigate to the listing page
+                await page.goto('https://www.tsx.com/listings/listing-with-us/listed-company-directory',
+                              wait_until='networkidle', timeout=90000)
+
+                # Wait for the table to load
+                print("⏳ Waiting for table to load...")
+                await page.wait_for_selector('table, .company-list, [class*="listing"]', timeout=45000)
+                await asyncio.sleep(8)  # Extra wait for dynamic content
+
+                table_rows = await page.query_selector_all('tbody tr')
+                if table_rows:
+                    print(f"✅ Found {len(table_rows)} rows in table")
+                    listings = await self._parse_rows(
+                        table_rows,
+                        cell_selector='td',
+                        min_cells=3,
+                        exchange='TSX/TSXV',
+                        country='Canada',
+                        extra_field='sector',
+                    )
+
+                # Save HTML for manual inspection if needed
+                await self._save_page_html(page, "tsx_page.html")
+
+                if listings:
+                    self._save_listings(listings, "tsx_tsxv_listings.json", "TSX/TSXV")
+                else:
+                    print("⚠️  No listings found. Check tsx_page.html for debugging")
+
+            except Exception as e:
+                print(f"❌ Error extracting TSX/TSXV: {e}")
+            finally:
+                await browser.close()
+
+        return listings
+
+    async def extract_cse_listings(self):
+        """Extract CSE listings.
+
+        Returns:
+            A list of dictionaries with the keys ``symbol``, ``name``,
+            ``industry``, ``exchange``, ``country`` and ``extracted_at``.
+        """
+        print("\n🔍 Extracting CSE listings...")
+
+        # Bound before the try block so a failed page load still returns a list.
+        listings = []
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=self.headless)
+            page = await browser.new_page()
+
+            try:
+                await page.goto('https://thecse.com/en/listings',
+                              wait_until='networkidle', timeout=90000)
+
+                print("⏳ Waiting for listings to load...")
+                await page.wait_for_selector('table, [class*="listing"], [class*="company"]', timeout=45000)
+                await asyncio.sleep(8)
+
+                # Rows may be real table rows or ARIA rows.
+                table_rows = await page.query_selector_all('tbody tr, [role="row"]')
+                if table_rows:
+                    print(f"✅ Found {len(table_rows)} rows")
+                    listings = await self._parse_rows(
+                        table_rows,
+                        cell_selector='td, [role="cell"]',
+                        min_cells=2,
+                        exchange='CSE',
+                        country='Canada',
+                        extra_field='industry',
+                    )
+
+                # Save HTML for debugging
+                await self._save_page_html(page, "cse_page.html")
+
+                if listings:
+                    self._save_listings(listings, "cse_listings.json", "CSE")
+                else:
+                    print("⚠️  No listings found. Check cse_page.html for debugging")
+
+            except Exception as e:
+                print(f"❌ Error extracting CSE: {e}")
+            finally:
+                await browser.close()
+
+        return listings
+
+    async def extract_cboe_listings(self):
+        """Extract CBOE listings.
+
+        Returns:
+            A list of dictionaries with the keys ``symbol``, ``name``,
+            ``exchange``, ``country`` and ``extracted_at``.
+        """
+        print("\n🔍 Extracting CBOE listings...")
+
+        # Bound before the try block so a failed page load still returns a list.
+        listings = []
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=self.headless)
+            page = await browser.new_page()
+
+            try:
+                await page.goto('https://www.cboe.com/us/equities/listings/',
+                              wait_until='networkidle', timeout=90000)
+
+                print("⏳ Waiting for listings to load...")
+                await asyncio.sleep(8)
+
+                # Look for any tables or lists
+                table_rows = await page.query_selector_all('tbody tr, [class*="listing"]')
+                if table_rows:
+                    print(f"✅ Found {len(table_rows)} potential listings")
+                    listings = await self._parse_rows(
+                        table_rows,
+                        cell_selector='td',
+                        min_cells=2,
+                        exchange='CBOE',
+                        country='USA',
+                    )
+
+                # Save HTML
+                await self._save_page_html(page, "cboe_page.html")
+
+                if listings:
+                    self._save_listings(listings, "cboe_listings.json", "CBOE")
+                else:
+                    print("⚠️  No listings found. Check cboe_page.html for debugging")
+
+            except Exception as e:
+                print(f"❌ Error extracting CBOE: {e}")
+            finally:
+                await browser.close()
+
+        return listings
+
+    async def extract_all(self):
+        """Extract from all exchanges, one after another.
+
+        Writes ``all_listings_combined.json`` under ``output_dir`` when at
+        least one listing was extracted.
+
+        Returns:
+            The listings of every exchange concatenated in the order
+            TSX/TSXV, CSE, CBOE.
+        """
+        print("=" * 60)
+        print("STOCK LISTING EXTRACTION")
+        print("=" * 60)
+
+        # Dict values are evaluated in source order, so the exchanges are
+        # still scraped sequentially.
+        all_listings = {
+            'tsx_tsxv': await self.extract_tsx_tsxv_listings(),
+            'cse': await self.extract_cse_listings(),
+            'cboe': await self.extract_cboe_listings(),
+        }
+
+        # Combine all listings
+        combined = [
+            listing
+            for listings in all_listings.values()
+            for listing in listings
+        ]
+
+        # Save combined file
+        if combined:
+            output_file = f"{self.output_dir}/all_listings_combined.json"
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(combined, f, indent=2)
+
+            print("\n" + "=" * 60)
+            print(f"✅ TOTAL EXTRACTED: {len(combined)} stocks")
+            print(f"📁 Saved to: {output_file}")
+            print("=" * 60)
+        else:
+            print("\n⚠️  No listings extracted. Check the HTML files for debugging.")
+
+        return combined
+
+
+async def main():
+    """Run a full extraction and print up to five of the extracted listings."""
+    all_stocks = await StockListingExtractor().extract_all()
+    if not all_stocks:
+        return
+
+    print("\n📊 Sample of extracted stocks:")
+    for entry in all_stocks[:5]:
+        symbol, name, exchange = entry['symbol'], entry['name'], entry['exchange']
+        print(f"  - {symbol}: {name} ({exchange})")
+
+
+# Start the extraction only when this file is executed as a script; importing
+# the module must not launch a browser.
+if __name__ == "__main__":
+    asyncio.run(main())