"""
Extract stock listings from TSX, TSXV, CSE, and CBOE.

Uses Playwright to handle JavaScript-rendered content.
"""

import asyncio
import json
import os
import time
from datetime import datetime

from playwright.async_api import async_playwright


class StockListingExtractor:
    """Scrape exchange listing pages and store the rows found as JSON.

    Each ``extract_*`` coroutine opens its own Chromium instance, loads one
    listing page, reads ``symbol`` / ``name`` (and optionally a third column)
    from every matching row, dumps the rendered HTML next to the JSON output
    for debugging, and returns the parsed rows as a list of dicts.
    """

    # Navigation and selector timeouts are in milliseconds (Playwright's unit).
    PAGE_LOAD_TIMEOUT_MS = 90000
    SELECTOR_TIMEOUT_MS = 45000
    # Extra pause, in seconds, to let dynamic content finish rendering.
    SETTLE_SECONDS = 8

    def __init__(self, output_dir="data/listings", headless=False):
        """Create the output directory and remember the browser mode.

        Args:
            output_dir: Directory that receives the JSON and HTML files.
                Created if it does not exist.
            headless: Passed to ``chromium.launch``. Defaults to ``False``
                (visible browser window), which is what this script has
                always used.
        """
        self.output_dir = output_dir
        self.headless = headless
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _clean_symbol(text):
        """Strip a ticker cell: drop newlines/CRs, turn tabs into spaces."""
        return text.strip().replace('\n', '').replace('\r', '').replace('\t', ' ')

    @staticmethod
    def _clean_text(text):
        """Strip a free-text cell and flatten newlines/CRs into spaces."""
        return text.strip().replace('\n', ' ').replace('\r', ' ')

    def _write_text(self, filename, content):
        """Write ``content`` to ``output_dir/filename`` and return the path."""
        path = f"{self.output_dir}/{filename}"
        with open(path, 'w', encoding='utf-8') as f:
            f.write(content)
        return path

    def _write_json(self, filename, payload):
        """Dump ``payload`` as JSON to ``output_dir/filename``; return the path."""
        path = f"{self.output_dir}/{filename}"
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)
        return path

    async def _parse_row(self, row, *, cell_selector, min_cells, extra_key,
                         exchange, country):
        """Turn one row element into a listing dict.

        Returns ``None`` when the row has fewer than ``min_cells`` cells
        (header rows, spacer rows, non-tabular matches).
        """
        cells = await row.query_selector_all(cell_selector)
        if len(cells) < min_cells:
            return None

        record = {
            'symbol': self._clean_symbol(await cells[0].inner_text()),
            'name': self._clean_text(await cells[1].inner_text()),
        }
        if extra_key is not None:
            # The third column is optional when min_cells is 2.
            extra = await cells[2].inner_text() if len(cells) > 2 else ""
            record[extra_key] = self._clean_text(extra)
        record['exchange'] = exchange
        record['country'] = country
        record['extracted_at'] = datetime.now().isoformat()
        return record

    async def _scrape_exchange(self, *, label, url, wait_selector, row_selector,
                               cell_selector, min_cells, extra_key, country,
                               html_name, json_name, waiting_msg, found_msg):
        """Load ``url``, parse its listing rows, and save HTML + JSON.

        Args:
            label: Exchange label used in messages and as the ``exchange``
                field of every record.
            url: Page to load.
            wait_selector: CSS selector to wait for before parsing, or
                ``None`` to skip the explicit wait.
            row_selector: CSS selector matching candidate rows.
            cell_selector: CSS selector matching cells inside a row.
            min_cells: Rows with fewer cells are ignored.
            extra_key: Dict key for the third column, or ``None`` to omit it.
            country: Value of the ``country`` field.
            html_name: File name for the HTML debug dump.
            json_name: File name for the JSON output.
            waiting_msg: Message printed after navigation completes.
            found_msg: ``str.format`` template with a ``{count}`` field,
                printed when at least one row matched.

        Returns:
            The list of parsed records. Any error during scraping is printed
            and whatever was parsed up to that point (possibly nothing) is
            returned instead of raising.
        """
        # Bound before the try block: if navigation or the selector wait
        # fails, the final ``return`` must still have a value to return.
        listings = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle',
                                timeout=self.PAGE_LOAD_TIMEOUT_MS)

                print(waiting_msg)
                if wait_selector is not None:
                    await page.wait_for_selector(
                        wait_selector, timeout=self.SELECTOR_TIMEOUT_MS)
                await asyncio.sleep(self.SETTLE_SECONDS)

                rows = await page.query_selector_all(row_selector)
                if rows:
                    print(found_msg.format(count=len(rows)))
                for row in rows:
                    try:
                        record = await self._parse_row(
                            row,
                            cell_selector=cell_selector,
                            min_cells=min_cells,
                            extra_key=extra_key,
                            exchange=label,
                            country=country,
                        )
                    except Exception as e:
                        # Best effort: one bad row must not lose the rest,
                        # but the failure is reported rather than hidden.
                        print(f"Error parsing row: {e}")
                        continue
                    if record is not None:
                        listings.append(record)

                # Save HTML for manual inspection if needed.
                self._write_text(html_name, await page.content())

                if listings:
                    output_file = self._write_json(json_name, listings)
                    print(f"✅ Saved {len(listings)} {label} listings to {output_file}")
                else:
                    print(f"⚠️ No listings found. Check {html_name} for debugging")
            except Exception as e:
                print(f"❌ Error extracting {label}: {e}")
            finally:
                await browser.close()

        return listings

    async def extract_tsx_tsxv_listings(self):
        """Extract TSX and TSXV listings.

        Returns:
            A list of dicts with keys ``symbol``, ``name``, ``sector``,
            ``exchange``, ``country`` and ``extracted_at``; empty if nothing
            was found or the scrape failed.
        """
        print("🔍 Extracting TSX/TSXV listings...")
        return await self._scrape_exchange(
            label='TSX/TSXV',
            url='https://www.tsx.com/listings/listing-with-us/listed-company-directory',
            wait_selector='table, .company-list, [class*="listing"]',
            row_selector='tbody tr',
            cell_selector='td',
            min_cells=3,
            extra_key='sector',
            country='Canada',
            html_name='tsx_page.html',
            json_name='tsx_tsxv_listings.json',
            waiting_msg="⏳ Waiting for table to load...",
            found_msg="✅ Found {count} rows in table",
        )

    async def extract_cse_listings(self):
        """Extract CSE listings.

        Returns:
            A list of dicts with keys ``symbol``, ``name``, ``industry``
            (empty string when the row has no third cell), ``exchange``,
            ``country`` and ``extracted_at``; empty if nothing was found or
            the scrape failed.
        """
        print("\n🔍 Extracting CSE listings...")
        return await self._scrape_exchange(
            label='CSE',
            url='https://thecse.com/en/listings',
            wait_selector='table, [class*="listing"], [class*="company"]',
            row_selector='tbody tr, [role="row"]',
            cell_selector='td, [role="cell"]',
            min_cells=2,
            extra_key='industry',
            country='Canada',
            html_name='cse_page.html',
            json_name='cse_listings.json',
            waiting_msg="⏳ Waiting for listings to load...",
            found_msg="✅ Found {count} rows",
        )

    async def extract_cboe_listings(self):
        """Extract CBOE listings.

        Returns:
            A list of dicts with keys ``symbol``, ``name``, ``exchange``,
            ``country`` and ``extracted_at``; empty if nothing was found or
            the scrape failed.
        """
        print("\n🔍 Extracting CBOE listings...")
        return await self._scrape_exchange(
            label='CBOE',
            url='https://www.cboe.com/us/equities/listings/',
            wait_selector=None,  # this page is only given the fixed settle delay
            row_selector='tbody tr, [class*="listing"]',
            cell_selector='td',
            min_cells=2,
            extra_key=None,
            country='USA',
            html_name='cboe_page.html',
            json_name='cboe_listings.json',
            waiting_msg="⏳ Waiting for listings to load...",
            found_msg="✅ Found {count} potential listings",
        )

    async def extract_all(self):
        """Extract from all exchanges and write one combined JSON file.

        The exchanges are scraped one after another (TSX/TSXV, CSE, CBOE).

        Returns:
            The concatenation of all per-exchange record lists, in that
            order. ``all_listings_combined.json`` is written only when the
            combined list is non-empty.
        """
        print("=" * 60)
        print("STOCK LISTING EXTRACTION")
        print("=" * 60)

        all_listings = {
            'tsx_tsxv': await self.extract_tsx_tsxv_listings(),
            'cse': await self.extract_cse_listings(),
            'cboe': await self.extract_cboe_listings(),
        }

        combined = []
        for listings in all_listings.values():
            combined.extend(listings)

        if combined:
            output_file = self._write_json('all_listings_combined.json', combined)
            print("\n" + "=" * 60)
            print(f"✅ TOTAL EXTRACTED: {len(combined)} stocks")
            print(f"📁 Saved to: {output_file}")
            print("=" * 60)
        else:
            print("\n⚠️ No listings extracted. Check the HTML files for debugging.")

        return combined


async def main():
    """Run the full extraction and print up to five sample records."""
    extractor = StockListingExtractor()
    listings = await extractor.extract_all()

    if listings:
        print("\n📊 Sample of extracted stocks:")
        for stock in listings[:5]:
            print(f" - {stock['symbol']}: {stock['name']} ({stock['exchange']})")


if __name__ == "__main__":
    asyncio.run(main())